diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c | 1 | ||||
-rw-r--r-- | mm/dmapool.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 14 | ||||
-rw-r--r-- | mm/huge_memory.c | 6 | ||||
-rw-r--r-- | mm/hugetlb.c | 41 | ||||
-rw-r--r-- | mm/memblock.c | 8 | ||||
-rw-r--r-- | mm/memory.c | 125 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 68 | ||||
-rw-r--r-- | mm/mmap.c | 34 | ||||
-rw-r--r-- | mm/nommu.c | 34 | ||||
-rw-r--r-- | mm/oom_kill.c | 2 | ||||
-rw-r--r-- | mm/page-writeback.c | 11 | ||||
-rw-r--r-- | mm/page_alloc.c | 58 | ||||
-rw-r--r-- | mm/page_cgroup.c | 10 | ||||
-rw-r--r-- | mm/pagewalk.c | 49 | ||||
-rw-r--r-- | mm/shmem.c | 552 | ||||
-rw-r--r-- | mm/sparse.c | 2 | ||||
-rw-r--r-- | mm/thrash.c | 17 | ||||
-rw-r--r-- | mm/truncate.c | 143 |
19 files changed, 695 insertions, 482 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2ef0dc9..8290b1e 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -606,6 +606,7 @@ static void bdi_prune_sb(struct backing_dev_info *bdi) void bdi_unregister(struct backing_dev_info *bdi) { if (bdi->dev) { + bdi_set_min_ratio(bdi, 0); trace_writeback_bdi_unregister(bdi); bdi_prune_sb(bdi); del_timer_sync(&bdi->wb.wakeup_timer); diff --git a/mm/dmapool.c b/mm/dmapool.c index 03bf3bb..fbb58e3 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -500,7 +500,7 @@ void dmam_pool_destroy(struct dma_pool *pool) { struct device *dev = pool->dev; - dma_pool_destroy(pool); WARN_ON(devres_destroy(dev, dmam_pool_release, dmam_pool_match, pool)); + dma_pool_destroy(pool); } EXPORT_SYMBOL(dmam_pool_destroy); diff --git a/mm/filemap.c b/mm/filemap.c index f820e60..10a1711 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -128,6 +128,7 @@ void __delete_from_page_cache(struct page *page) radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; + /* Leave page->index set: truncation lookup relies upon it */ mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); if (PageSwapBacked(page)) @@ -483,6 +484,7 @@ int add_to_page_cache_locked(struct page *page, struct address_space *mapping, spin_unlock_irq(&mapping->tree_lock); } else { page->mapping = NULL; + /* Leave page->index set: truncation relies upon it */ spin_unlock_irq(&mapping->tree_lock); mem_cgroup_uncharge_cache_page(page); page_cache_release(page); @@ -1792,7 +1794,7 @@ EXPORT_SYMBOL(generic_file_readonly_mmap); static struct page *__read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) { @@ -1823,7 +1825,7 @@ repeat: static struct page *do_read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data, gfp_t gfp) @@ -1863,7 +1865,7 @@ out: * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Same as read_cache_page, but don't wait for page to become unlocked * after submitting it to the filler. @@ -1875,7 +1877,7 @@ out: */ struct page *read_cache_page_async(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return do_read_cache_page(mapping, index, filler, data, mapping_gfp_mask(mapping)); @@ -1923,7 +1925,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); * @mapping: the page's address_space * @index: the page index * @filler: function to perform the read - * @data: destination for read data + * @data: first arg to filler(data, page) function, often left as NULL * * Read into the page cache. If a page already exists, and PageUptodate() is * not set, try to fill the page then wait for it to become unlocked. @@ -1932,7 +1934,7 @@ EXPORT_SYMBOL(read_cache_page_gfp); */ struct page *read_cache_page(struct address_space *mapping, pgoff_t index, - int (*filler)(void *,struct page*), + int (*filler)(void *, struct page *), void *data) { return wait_on_page_read(read_cache_page_async(mapping, index, filler, data)); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 81532f29..e2d1587 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1596,14 +1596,13 @@ void __khugepaged_exit(struct mm_struct *mm) list_del(&mm_slot->mm_node); free = 1; } + spin_unlock(&khugepaged_mm_lock); if (free) { - spin_unlock(&khugepaged_mm_lock); clear_bit(MMF_VM_HUGEPAGE, &mm->flags); free_mm_slot(mm_slot); mmdrop(mm); } else if (mm_slot) { - spin_unlock(&khugepaged_mm_lock); /* * This is required to serialize against * khugepaged_test_exit() (which is guaranteed to run @@ -1614,8 +1613,7 @@ void __khugepaged_exit(struct mm_struct *mm) */ down_write(&mm->mmap_sem); up_write(&mm->mmap_sem); - } else - spin_unlock(&khugepaged_mm_lock); + } } static void release_pte_page(struct page *page) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index bfcf153..dae27ba 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -24,7 +24,7 @@ #include <asm/page.h> #include <asm/pgtable.h> -#include <asm/io.h> +#include <linux/io.h> #include <linux/hugetlb.h> #include <linux/node.h> @@ -62,10 +62,10 @@ static DEFINE_SPINLOCK(hugetlb_lock); * must either hold the mmap_sem for write, or the mmap_sem for read and * the hugetlb_instantiation mutex: * - * down_write(&mm->mmap_sem); + * down_write(&mm->mmap_sem); * or - * down_read(&mm->mmap_sem); - * mutex_lock(&hugetlb_instantiation_mutex); + * down_read(&mm->mmap_sem); + * mutex_lock(&hugetlb_instantiation_mutex); */ struct file_region { struct list_head link; @@ -503,9 +503,10 @@ static void update_and_free_page(struct hstate *h, struct page *page) h->nr_huge_pages--; h->nr_huge_pages_node[page_to_nid(page)]--; for (i = 0; i < pages_per_huge_page(h); i++) { - page[i].flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | - 1 << PG_dirty | 1 << PG_active | 1 << PG_reserved | - 1 << PG_private | 1<< PG_writeback); + page[i].flags &= ~(1 << PG_locked | 1 << PG_error | + 1 << PG_referenced | 1 << PG_dirty | + 1 << PG_active | 1 << PG_reserved | + 1 << PG_private | 1 << PG_writeback); } set_compound_page_dtor(page, NULL); set_page_refcounted(page); @@ -591,7 +592,6 @@ int PageHuge(struct page *page) return dtor == free_huge_page; } - EXPORT_SYMBOL_GPL(PageHuge); static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) @@ -1105,8 +1105,16 @@ static void __init gather_bootmem_prealloc(void) struct huge_bootmem_page *m; list_for_each_entry(m, &huge_boot_pages, list) { - struct page *page = virt_to_page(m); struct hstate *h = m->hstate; + struct page *page; + +#ifdef CONFIG_HIGHMEM + page = pfn_to_page(m->phys >> PAGE_SHIFT); + free_bootmem_late((unsigned long)m, + sizeof(struct huge_bootmem_page)); +#else + page = virt_to_page(m); +#endif __ClearPageReserved(page); WARN_ON(page_count(page) != 1); prep_compound_huge_page(page, h->order); @@ -2124,9 +2132,8 @@ static void set_huge_ptep_writable(struct vm_area_struct *vma, pte_t entry; entry = pte_mkwrite(pte_mkdirty(huge_ptep_get(ptep))); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) { + if (huge_ptep_set_access_flags(vma, address, ptep, entry, 1)) update_mmu_cache(vma, address, ptep); - } } @@ -2181,9 +2188,9 @@ static int is_hugetlb_entry_migration(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return 0; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_migration_entry(swp)) { + if (non_swap_entry(swp) && is_migration_entry(swp)) return 1; - } else + else return 0; } @@ -2194,9 +2201,9 @@ static int is_hugetlb_entry_hwpoisoned(pte_t pte) if (huge_pte_none(pte) || pte_present(pte)) return 0; swp = pte_to_swp_entry(pte); - if (non_swap_entry(swp) && is_hwpoison_entry(swp)) { + if (non_swap_entry(swp) && is_hwpoison_entry(swp)) return 1; - } else + else return 0; } @@ -2559,7 +2566,7 @@ retry: * So we need to block hugepage fault by PG_hwpoison bit check. */ if (unlikely(PageHWPoison(page))) { - ret = VM_FAULT_HWPOISON | + ret = VM_FAULT_HWPOISON | VM_FAULT_SET_HINDEX(h - hstates); goto backout_unlocked; } @@ -2627,7 +2634,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, migration_entry_wait(mm, (pmd_t *)ptep, address); return 0; } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) - return VM_FAULT_HWPOISON_LARGE | + return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(h - hstates); } diff --git a/mm/memblock.c b/mm/memblock.c index a0562d1..ccbf973 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -758,9 +758,9 @@ void __init memblock_analyze(void) /* Check marker in the unused last array entry */ WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base - != (phys_addr_t)RED_INACTIVE); + != MEMBLOCK_INACTIVE); WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base - != (phys_addr_t)RED_INACTIVE); + != MEMBLOCK_INACTIVE); memblock.memory_size = 0; @@ -786,8 +786,8 @@ void __init memblock_init(void) memblock.reserved.max = INIT_MEMBLOCK_REGIONS; /* Write a marker in the unused last array entry */ - memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; - memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE; + memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; + memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = MEMBLOCK_INACTIVE; /* Create a dummy zero size MEMBLOCK which will get coalesced away later. * This simplifies the memblock_add() code below... diff --git a/mm/memory.c b/mm/memory.c index 9b8a01d..a56e3ba 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1290,13 +1290,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, return addr; } -#ifdef CONFIG_PREEMPT -# define ZAP_BLOCK_SIZE (8 * PAGE_SIZE) -#else -/* No preempt: go for improved straight-line efficiency */ -# define ZAP_BLOCK_SIZE (1024 * PAGE_SIZE) -#endif - /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather @@ -1310,10 +1303,6 @@ static unsigned long unmap_page_range(struct mmu_gather *tlb, * * Unmap all pages in the vma list. * - * We aim to not hold locks for too long (for scheduling latency reasons). - * So zap pages in ZAP_BLOCK_SIZE bytecounts. This means we need to - * return the ending mmu_gather to the caller. - * * Only addresses between `start' and `end' will be unmapped. * * The VMA list must be sorted in ascending virtual address order. @@ -1816,7 +1805,63 @@ next_page: } EXPORT_SYMBOL(__get_user_pages); -/** +/* + * fixup_user_fault() - manually resolve a user page fault + * @tsk: the task_struct to use for page fault accounting, or + * NULL if faults are not to be recorded. + * @mm: mm_struct of target mm + * @address: user address + * @fault_flags:flags to pass down to handle_mm_fault() + * + * This is meant to be called in the specific scenario where for locking reasons + * we try to access user memory in atomic context (within a pagefault_disable() + * section), this returns -EFAULT, and we want to resolve the user fault before + * trying again. + * + * Typically this is meant to be used by the futex code. + * + * The main difference with get_user_pages() is that this function will + * unconditionally call handle_mm_fault() which will in turn perform all the + * necessary SW fixup of the dirty and young bits in the PTE, while + * handle_mm_fault() only guarantees to update these in the struct page. + * + * This is important for some architectures where those bits also gate the + * access permission to the page because they are maintained in software. On + * such architectures, gup() will not be enough to make a subsequent access + * succeed. + * + * This should be called with the mm_sem held for read. + */ +int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, + unsigned long address, unsigned int fault_flags) +{ + struct vm_area_struct *vma; + int ret; + + vma = find_extend_vma(mm, address); + if (!vma || address < vma->vm_start) + return -EFAULT; + + ret = handle_mm_fault(mm, vma, address, fault_flags); + if (ret & VM_FAULT_ERROR) { + if (ret & VM_FAULT_OOM) + return -ENOMEM; + if (ret & (VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE)) + return -EHWPOISON; + if (ret & VM_FAULT_SIGBUS) + return -EFAULT; + BUG(); + } + if (tsk) { + if (ret & VM_FAULT_MAJOR) + tsk->maj_flt++; + else + tsk->min_flt++; + } + return 0; +} + +/* * get_user_pages() - pin user pages in memory * @tsk: the task_struct to use for page fault accounting, or * NULL if faults are not to be recorded. @@ -3104,14 +3149,34 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *page_table; spinlock_t *ptl; struct page *page; + struct page *cow_page; pte_t entry; int anon = 0; - int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; int page_mkwrite = 0; + /* + * If we do COW later, allocate page befor taking lock_page() + * on the file cache page. This will reduce lock holding time. + */ + if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { + + if (unlikely(anon_vma_prepare(vma))) + return VM_FAULT_OOM; + + cow_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); + if (!cow_page) + return VM_FAULT_OOM; + + if (mem_cgroup_newpage_charge(cow_page, mm, GFP_KERNEL)) { + page_cache_release(cow_page); + return VM_FAULT_OOM; + } + } else + cow_page = NULL; + vmf.virtual_address = (void __user *)(address & PAGE_MASK); vmf.pgoff = pgoff; vmf.flags = flags; @@ -3120,12 +3185,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = vma->vm_ops->fault(vma, &vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; + goto uncharge_out; if (unlikely(PageHWPoison(vmf.page))) { if (ret & VM_FAULT_LOCKED) unlock_page(vmf.page); - return VM_FAULT_HWPOISON; + ret = VM_FAULT_HWPOISON; + goto uncharge_out; } /* @@ -3143,23 +3209,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, page = vmf.page; if (flags & FAULT_FLAG_WRITE) { if (!(vma->vm_flags & VM_SHARED)) { + page = cow_page; anon = 1; - if (unlikely(anon_vma_prepare(vma))) { - ret = VM_FAULT_OOM; - goto out; - } - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, - vma, address); - if (!page) { - ret = VM_FAULT_OOM; - goto out; - } - if (mem_cgroup_newpage_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - page_cache_release(page); - goto out; - } - charged = 1; copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -3228,8 +3279,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, page_table); } else { - if (charged) - mem_cgroup_uncharge_page(page); + if (cow_page) + mem_cgroup_uncharge_page(cow_page); if (anon) page_cache_release(page); else @@ -3238,7 +3289,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, pte_unmap_unlock(page_table, ptl); -out: if (dirty_page) { struct address_space *mapping = page->mapping; @@ -3268,6 +3318,13 @@ out: unwritable_page: page_cache_release(page); return ret; +uncharge_out: + /* fs's fault handler get error */ + if (cow_page) { + mem_cgroup_uncharge_page(cow_page); + page_cache_release(cow_page); + } + return ret; } static int do_linear_fault(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c46887b..6e7d8b2 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -34,6 +34,17 @@ #include "internal.h" +/* + * online_page_callback contains pointer to current page onlining function. + * Initially it is generic_online_page(). If it is required it could be + * changed by calling set_online_page_callback() for callback registration + * and restore_online_page_callback() for generic callback restore. + */ + +static void generic_online_page(struct page *page); + +static online_page_callback_t online_page_callback = generic_online_page; + DEFINE_MUTEX(mem_hotplug_mutex); void lock_memory_hotplug(void) @@ -361,23 +372,74 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, } EXPORT_SYMBOL_GPL(__remove_pages); -void online_page(struct page *page) +int set_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + lock_memory_hotplug(); + + if (online_page_callback == generic_online_page) { + online_page_callback = callback; + rc = 0; + } + + unlock_memory_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(set_online_page_callback); + +int restore_online_page_callback(online_page_callback_t callback) +{ + int rc = -EINVAL; + + lock_memory_hotplug(); + + if (online_page_callback == callback) { + online_page_callback = generic_online_page; + rc = 0; + } + + unlock_memory_hotplug(); + + return rc; +} +EXPORT_SYMBOL_GPL(restore_online_page_callback); + +void __online_page_set_limits(struct page *page) { unsigned long pfn = page_to_pfn(page); - totalram_pages++; if (pfn >= num_physpages) num_physpages = pfn + 1; +} +EXPORT_SYMBOL_GPL(__online_page_set_limits); + +void __online_page_increment_counters(struct page *page) +{ + totalram_pages++; #ifdef CONFIG_HIGHMEM if (PageHighMem(page)) totalhigh_pages++; #endif +} +EXPORT_SYMBOL_GPL(__online_page_increment_counters); +void __online_page_free(struct page *page) +{ ClearPageReserved(page); init_page_count(page); __free_page(page); } +EXPORT_SYMBOL_GPL(__online_page_free); + +static void generic_online_page(struct page *page) +{ + __online_page_set_limits(page); + __online_page_increment_counters(page); + __online_page_free(page); +} static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, void *arg) @@ -388,7 +450,7 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, if (PageReserved(pfn_to_page(start_pfn))) for (i = 0; i < nr_pages; i++) { page = pfn_to_page(start_pfn + i); - online_page(page); + (*online_page_callback)(page); onlined_pages++; } *(unsigned long *)arg = onlined_pages; @@ -122,9 +122,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); - free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* @@ -136,34 +144,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free += global_page_state(NR_SLAB_RECLAIMABLE); /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - - /* * Leave reserved pages. The pages are not for anonymous pages. */ - if (n <= totalreserve_pages) + if (free <= totalreserve_pages) goto error; else - n -= totalreserve_pages; + free -= totalreserve_pages; /* * Leave the last 3% for root */ if (!cap_sys_admin) - n -= n / 32; - free += n; + free -= free / 32; if (free > pages) return 0; @@ -1884,9 +1884,17 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) return 0; if (sysctl_overcommit_memory == OVERCOMMIT_GUESS) { - unsigned long n; + free = global_page_state(NR_FREE_PAGES); + free += global_page_state(NR_FILE_PAGES); + + /* + * shmem pages shouldn't be counted as free in this + * case, they can't be purged, only swapped out, and + * that won't affect the overall amount of available + * memory in the system. + */ + free -= global_page_state(NR_SHMEM); - free = global_page_state(NR_FILE_PAGES); free += nr_swap_pages; /* @@ -1898,34 +1906,18 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin) free += global_page_state(NR_SLAB_RECLAIMABLE); /* - * Leave the last 3% for root - */ - if (!cap_sys_admin) - free -= free / 32; - - if (free > pages) - return 0; - - /* - * nr_free_pages() is very expensive on large systems, - * only call if we're about to fail. - */ - n = nr_free_pages(); - - /* * Leave reserved pages. The pages are not for anonymous pages. */ - if (n <= totalreserve_pages) + if (free <= totalreserve_pages) goto error; else - n -= totalreserve_pages; + free -= totalreserve_pages; /* * Leave the last 3% for root */ if (!cap_sys_admin) - n -= n / 32; - free += n; + free -= free / 32; if (free > pages) return 0; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index b0be989..eafff89 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -487,7 +487,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order, /* * If any of p's children has a different mm and is eligible for kill, - * the one with the highest badness() score is sacrificed for its + * the one with the highest oom_badness() score is sacrificed for its * parent. This attempts to lose the minimal amount of work done while * still freeing memory. */ diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 31f6988..d8767b3 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1141,7 +1141,6 @@ EXPORT_SYMBOL(account_page_dirtied); void account_page_writeback(struct page *page) { inc_zone_page_state(page, NR_WRITEBACK); - inc_zone_page_state(page, NR_WRITTEN); } EXPORT_SYMBOL(account_page_writeback); @@ -1358,8 +1357,10 @@ int test_clear_page_writeback(struct page *page) } else { ret = TestClearPageWriteback(page); } - if (ret) + if (ret) { dec_zone_page_state(page, NR_WRITEBACK); + inc_zone_page_state(page, NR_WRITTEN); + } return ret; } @@ -1405,10 +1406,6 @@ EXPORT_SYMBOL(test_set_page_writeback); */ int mapping_tagged(struct address_space *mapping, int tag) { - int ret; - rcu_read_lock(); - ret = radix_tree_tagged(&mapping->page_tree, tag); - rcu_read_unlock(); - return ret; + return radix_tree_tagged(&mapping->page_tree, tag); } EXPORT_SYMBOL(mapping_tagged); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 9119faa..0944723 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1616,6 +1616,21 @@ static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) set_bit(i, zlc->fullzones); } +/* + * clear all zones full, called after direct reclaim makes progress so that + * a zone that was recently full is not skipped over for up to a second + */ +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ + struct zonelist_cache *zlc; /* cached zonelist speedup info */ + + zlc = zonelist->zlcache_ptr; + if (!zlc) + return; + + bitmap_zero(zlc->fullzones, MAX_ZONES_PER_ZONELIST); +} + #else /* CONFIG_NUMA */ static nodemask_t *zlc_setup(struct zonelist *zonelist, int alloc_flags) @@ -1632,6 +1647,10 @@ static int zlc_zone_worth_trying(struct zonelist *zonelist, struct zoneref *z, static void zlc_mark_zone_full(struct zonelist *zonelist, struct zoneref *z) { } + +static void zlc_clear_zones_full(struct zonelist *zonelist) +{ +} #endif /* CONFIG_NUMA */ /* @@ -1664,7 +1683,7 @@ zonelist_scan: continue; if ((alloc_flags & ALLOC_CPUSET) && !cpuset_zone_allowed_softwall(zone, gfp_mask)) - goto try_next_zone; + continue; BUILD_BUG_ON(ALLOC_NO_WATERMARKS < NR_WMARK); if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { @@ -1676,17 +1695,36 @@ zonelist_scan: classzone_idx, alloc_flags)) goto try_this_zone; + if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + /* + * we do zlc_setup if there are multiple nodes + * and before considering the first zone allowed + * by the cpuset. + */ + allowednodes = zlc_setup(zonelist, alloc_flags); + zlc_active = 1; + did_zlc_setup = 1; + } + if (zone_reclaim_mode == 0) goto this_zone_full; + /* + * As we may have just activated ZLC, check if the first + * eligible zone has failed zone_reclaim recently. + */ + if (NUMA_BUILD && zlc_active && + !zlc_zone_worth_trying(zonelist, z, allowednodes)) + continue; + ret = zone_reclaim(zone, gfp_mask, order); switch (ret) { case ZONE_RECLAIM_NOSCAN: /* did not scan */ - goto try_next_zone; + continue; case ZONE_RECLAIM_FULL: /* scanned but unreclaimable */ - goto this_zone_full; + continue; default: /* did we reclaim enough */ if (!zone_watermark_ok(zone, order, mark, @@ -1703,16 +1741,6 @@ try_this_zone: this_zone_full: if (NUMA_BUILD) zlc_mark_zone_full(zonelist, z); -try_next_zone: - if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { - /* - * we do zlc_setup after the first zone is tried but only - * if there are multiple nodes make it worthwhile - */ - allowednodes = zlc_setup(zonelist, alloc_flags); - zlc_active = 1; - did_zlc_setup = 1; - } } if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { @@ -1954,6 +1982,10 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, if (unlikely(!(*did_some_progress))) return NULL; + /* After successful reclaim, reconsider all zones for allocation */ + if (NUMA_BUILD) + zlc_clear_zones_full(zonelist); + retry: page = get_page_from_freelist(gfp_mask, nodemask, order, zonelist, high_zoneidx, diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 53bffc6..39d216d 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -225,8 +225,8 @@ int __meminit online_page_cgroup(unsigned long start_pfn, unsigned long start, end, pfn; int fail = 0; - start = start_pfn & ~(PAGES_PER_SECTION - 1); - end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); if (nid == -1) { /* @@ -258,8 +258,8 @@ int __meminit offline_page_cgroup(unsigned long start_pfn, { unsigned long start, end, pfn; - start = start_pfn & ~(PAGES_PER_SECTION - 1); - end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + start = SECTION_ALIGN_DOWN(start_pfn); + end = SECTION_ALIGN_UP(start_pfn + nr_pages); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_cgroup(pfn); @@ -537,7 +537,7 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) nomem: printk(KERN_INFO "couldn't allocate enough memory for swap_cgroup.\n"); printk(KERN_INFO - "swap_cgroup can be disabled by noswapaccount boot option\n"); + "swap_cgroup can be disabled by swapaccount=0 boot option\n"); return -ENOMEM; } diff --git a/mm/pagewalk.c b/mm/pagewalk.c index c3450d5..2f5cf10 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -126,7 +126,39 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, return 0; } -#endif + +static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +{ + struct vm_area_struct *vma; + + /* We don't need vma lookup at all. */ + if (!walk->hugetlb_entry) + return NULL; + + VM_BUG_ON(!rwsem_is_locked(&walk->mm->mmap_sem)); + vma = find_vma(walk->mm, addr); + if (vma && vma->vm_start <= addr && is_vm_hugetlb_page(vma)) + return vma; + + return NULL; +} + +#else /* CONFIG_HUGETLB_PAGE */ +static struct vm_area_struct* hugetlb_vma(unsigned long addr, struct mm_walk *walk) +{ + return NULL; +} + +static int walk_hugetlb_range(struct vm_area_struct *vma, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + return 0; +} + +#endif /* CONFIG_HUGETLB_PAGE */ + + /** * walk_page_range - walk a memory map's page tables with a callback @@ -144,11 +176,15 @@ static int walk_hugetlb_range(struct vm_area_struct *vma, * associated range, and a copy of the original mm_walk for access to * the ->private or ->mm fields. * - * No locks are taken, but the bottom level iterator will map PTE + * Usually no locks are taken, but splitting transparent huge page may + * take page table lock. And the bottom level iterator will map PTE * directories from highmem if necessary. * * If any callback returns a non-zero value, the walk is aborted and * the return value is propagated back to the caller. Otherwise 0 is returned. + * + * walk->mm->mmap_sem must be held for at least read if walk->hugetlb_entry + * is !NULL. */ int walk_page_range(unsigned long addr, unsigned long end, struct mm_walk *walk) @@ -165,18 +201,17 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, addr); do { - struct vm_area_struct *uninitialized_var(vma); + struct vm_area_struct *vma; next = pgd_addr_end(addr, end); -#ifdef CONFIG_HUGETLB_PAGE /* * handle hugetlb vma individually because pagetable walk for * the hugetlb page is dependent on the architecture and * we can't handled it in the same manner as non-huge pages. */ - vma = find_vma(walk->mm, addr); - if (vma && is_vm_hugetlb_page(vma)) { + vma = hugetlb_vma(addr, walk); + if (vma) { if (vma->vm_end < next) next = vma->vm_end; /* @@ -189,7 +224,7 @@ int walk_page_range(unsigned long addr, unsigned long end, pgd = pgd_offset(walk->mm, next); continue; } -#endif + if (pgd_none_or_clear_bad(pgd)) { if (walk->pte_hole) err = walk->pte_hole(addr, next, walk); @@ -51,6 +51,7 @@ static struct vfsmount *shm_mnt; #include <linux/shmem_fs.h> #include <linux/writeback.h> #include <linux/blkdev.h> +#include <linux/splice.h> #include <linux/security.h> #include <linux/swapops.h> #include <linux/mempolicy.h> @@ -126,8 +127,15 @@ static unsigned long shmem_default_max_inodes(void) } #endif -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type); +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type); + +static inline int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, int *fault_type) +{ + return shmem_getpage_gfp(inode, index, pagep, sgp, + mapping_gfp_mask(inode->i_mapping), fault_type); +} static inline struct page *shmem_dir_alloc(gfp_t gfp_mask) { @@ -241,9 +249,7 @@ static void shmem_free_blocks(struct inode *inode, long pages) struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { percpu_counter_add(&sbinfo->used_blocks, -pages); - spin_lock(&inode->i_lock); inode->i_blocks -= pages*BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } } @@ -405,10 +411,12 @@ static void shmem_swp_set(struct shmem_inode_info *info, swp_entry_t *entry, uns * @info: info structure for the inode * @index: index of the page to find * @sgp: check and recheck i_size? skip allocation? + * @gfp: gfp mask to use for any page allocation * * If the entry does not exist, allocate it. */ -static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long index, enum sgp_type sgp) +static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, + unsigned long index, enum sgp_type sgp, gfp_t gfp) { struct inode *inode = &info->vfs_inode; struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb); @@ -432,13 +440,11 @@ static swp_entry_t *shmem_swp_alloc(struct shmem_inode_info *info, unsigned long sbinfo->max_blocks - 1) >= 0) return ERR_PTR(-ENOSPC); percpu_counter_inc(&sbinfo->used_blocks); - spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } spin_unlock(&info->lock); - page = shmem_dir_alloc(mapping_gfp_mask(inode->i_mapping)); + page = shmem_dir_alloc(gfp); spin_lock(&info->lock); if (!page) { @@ -966,20 +972,7 @@ found: error = add_to_page_cache_locked(page, mapping, idx, GFP_NOWAIT); /* which does mem_cgroup_uncharge_cache_page on error */ - if (error == -EEXIST) { - struct page *filepage = find_get_page(mapping, idx); - error = 1; - if (filepage) { - /* - * There might be a more uptodate page coming down - * from a stacked writepage: forget our swappage if so. - */ - if (PageUptodate(filepage)) - error = 0; - page_cache_release(filepage); - } - } - if (!error) { + if (error != -ENOMEM) { delete_from_swap_cache(page); set_page_dirty(page); info->flags |= SHMEM_PAGEIN; @@ -1066,16 +1059,17 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * shmem_backing_dev_info's capabilities prevent regular writeback or * sync from ever calling shmem_writepage; but a stacking filesystem - * may use the ->writepage of its underlying filesystem, in which case + * might use ->writepage of its underlying filesystem, in which case * tmpfs should write out to swap only in response to memory pressure, - * and not for the writeback threads or sync. However, in those cases, - * we do still want to check if there's a redundant swappage to be - * discarded. + * and not for the writeback threads or sync. */ - if (wbc->for_reclaim) - swap = get_swap_page(); - else - swap.val = 0; + if (!wbc->for_reclaim) { + WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ + goto redirty; + } + swap = get_swap_page(); + if (!swap.val) + goto redirty; /* * Add inode to shmem_unuse()'s list of swapped-out inodes, @@ -1086,15 +1080,12 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * we've taken the spinlock, because shmem_unuse_inode() will * prune a !swapped inode from the swaplist under both locks. */ - if (swap.val) { - mutex_lock(&shmem_swaplist_mutex); - if (list_empty(&info->swaplist)) - list_add_tail(&info->swaplist, &shmem_swaplist); - } + mutex_lock(&shmem_swaplist_mutex); + if (list_empty(&info->swaplist)) + list_add_tail(&info->swaplist, &shmem_swaplist); spin_lock(&info->lock); - if (swap.val) - mutex_unlock(&shmem_swaplist_mutex); + mutex_unlock(&shmem_swaplist_mutex); if (index >= info->next_index) { BUG_ON(!(info->flags & SHMEM_TRUNCATE)); @@ -1102,16 +1093,13 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } entry = shmem_swp_entry(info, index, NULL); if (entry->val) { - /* - * The more uptodate page coming down from a stacked - * writepage should replace our old swappage. - */ + WARN_ON_ONCE(1); /* Still happens? Tell us about it! */ free_swap_and_cache(*entry); shmem_swp_set(info, entry, 0); } shmem_recalc_inode(inode); - if (swap.val && add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { + if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) { delete_from_page_cache(page); shmem_swp_set(info, entry, swap.val); shmem_swp_unmap(entry); @@ -1228,92 +1216,83 @@ static inline struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) #endif /* - * shmem_getpage - either get the page from swap or allocate a new one + * shmem_getpage_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap * entry since a page cannot live in both the swap and page cache */ -static int shmem_getpage(struct inode *inode, unsigned long idx, - struct page **pagep, enum sgp_type sgp, int *type) +static int shmem_getpage_gfp(struct inode *inode, pgoff_t idx, + struct page **pagep, enum sgp_type sgp, gfp_t gfp, int *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_sb_info *sbinfo; - struct page *filepage = *pagep; - struct page *swappage; + struct page *page; struct page *prealloc_page = NULL; swp_entry_t *entry; swp_entry_t swap; - gfp_t gfp; int error; + int ret; if (idx >= SHMEM_MAX_INDEX) return -EFBIG; - - if (type) - *type = 0; - - /* - * Normally, filepage is NULL on entry, and either found - * uptodate immediately, or allocated and zeroed, or read - * in under swappage, which is then assigned to filepage. - * But shmem_readpage (required for splice) passes in a locked - * filepage, which may be found not uptodate by other callers - * too, and may need to be copied from the swappage read in. - */ repeat: - if (!filepage) - filepage = find_lock_page(mapping, idx); - if (filepage && PageUptodate(filepage)) - goto done; - gfp = mapping_gfp_mask(mapping); - if (!filepage) { + page = find_lock_page(mapping, idx); + if (page) { /* - * Try to preload while we can wait, to not make a habit of - * draining atomic reserves; but don't latch on to this cpu. + * Once we can get the page lock, it must be uptodate: + * if there were an error in reading back from swap, + * the page would not be inserted into the filecache. */ - error = radix_tree_preload(gfp & ~__GFP_HIGHMEM); - if (error) - goto failed; - radix_tree_preload_end(); - if (sgp != SGP_READ && !prealloc_page) { - /* We don't care if this fails */ - prealloc_page = shmem_alloc_page(gfp, info, idx); - if (prealloc_page) { - if (mem_cgroup_cache_charge(prealloc_page, - current->mm, GFP_KERNEL)) { - page_cache_release(prealloc_page); - prealloc_page = NULL; - } + BUG_ON(!PageUptodate(page)); + goto done; + } + + /* + * Try to preload while we can wait, to not make a habit of + * draining atomic reserves; but don't latch on to this cpu. + */ + error = radix_tree_preload(gfp & GFP_RECLAIM_MASK); + if (error) + goto out; + radix_tree_preload_end(); + + if (sgp != SGP_READ && !prealloc_page) { + prealloc_page = shmem_alloc_page(gfp, info, idx); + if (prealloc_page) { + SetPageSwapBacked(prealloc_page); + if (mem_cgroup_cache_charge(prealloc_page, + current->mm, GFP_KERNEL)) { + page_cache_release(prealloc_page); + prealloc_page = NULL; } } } - error = 0; spin_lock(&info->lock); shmem_recalc_inode(inode); - entry = shmem_swp_alloc(info, idx, sgp); + entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) { spin_unlock(&info->lock); error = PTR_ERR(entry); - goto failed; + goto out; } swap = *entry; if (swap.val) { /* Look it up and read it in.. */ - swappage = lookup_swap_cache(swap); - if (!swappage) { + page = lookup_swap_cache(swap); + if (!page) { shmem_swp_unmap(entry); spin_unlock(&info->lock); /* here we actually do the io */ - if (type) - *type |= VM_FAULT_MAJOR; - swappage = shmem_swapin(swap, gfp, info, idx); - if (!swappage) { + if (fault_type) + *fault_type |= VM_FAULT_MAJOR; + page = shmem_swapin(swap, gfp, info, idx); + if (!page) { spin_lock(&info->lock); - entry = shmem_swp_alloc(info, idx, sgp); + entry = shmem_swp_alloc(info, idx, sgp, gfp); if (IS_ERR(entry)) error = PTR_ERR(entry); else { @@ -1323,62 +1302,42 @@ repeat: } spin_unlock(&info->lock); if (error) - goto failed; + goto out; goto repeat; } - wait_on_page_locked(swappage); - page_cache_release(swappage); + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } /* We have to do this with page locked to prevent races */ - if (!trylock_page(swappage)) { + if (!trylock_page(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - wait_on_page_locked(swappage); - page_cache_release(swappage); + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } - if (PageWriteback(swappage)) { + if (PageWriteback(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - wait_on_page_writeback(swappage); - unlock_page(swappage); - page_cache_release(swappage); + wait_on_page_writeback(page); + unlock_page(page); + page_cache_release(page); goto repeat; } - if (!PageUptodate(swappage)) { + if (!PageUptodate(page)) { shmem_swp_unmap(entry); spin_unlock(&info->lock); - unlock_page(swappage); - page_cache_release(swappage); + unlock_page(page); + page_cache_release(page); error = -EIO; - goto failed; + goto out; } - if (filepage) { - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(swappage); - spin_unlock(&info->lock); - copy_highpage(filepage, swappage); - unlock_page(swappage); - page_cache_release(swappage); - flush_dcache_page(filepage); - SetPageUptodate(filepage); - set_page_dirty(filepage); - swap_free(swap); - } else if (!(error = add_to_page_cache_locked(swappage, mapping, - idx, GFP_NOWAIT))) { - info->flags |= SHMEM_PAGEIN; - shmem_swp_set(info, entry, 0); - shmem_swp_unmap(entry); - delete_from_swap_cache(swappage); - spin_unlock(&info->lock); - filepage = swappage; - set_page_dirty(filepage); - swap_free(swap); - } else { + error = add_to_page_cache_locked(page, mapping, + idx, GFP_NOWAIT); + if (error) { shmem_swp_unmap(entry); spin_unlock(&info->lock); if (error == -ENOMEM) { @@ -1387,32 +1346,38 @@ repeat: * call memcg's OOM if needed. */ error = mem_cgroup_shmem_charge_fallback( - swappage, - current->mm, - gfp); + page, current->mm, gfp); if (error) { - unlock_page(swappage); - page_cache_release(swappage); - goto failed; + unlock_page(page); + page_cache_release(page); + goto out; } } - unlock_page(swappage); - page_cache_release(swappage); + unlock_page(page); + page_cache_release(page); goto repeat; } - } else if (sgp == SGP_READ && !filepage) { + + info->flags |= SHMEM_PAGEIN; + shmem_swp_set(info, entry, 0); shmem_swp_unmap(entry); - filepage = find_get_page(mapping, idx); - if (filepage && - (!PageUptodate(filepage) || !trylock_page(filepage))) { + delete_from_swap_cache(page); + spin_unlock(&info->lock); + set_page_dirty(page); + swap_free(swap); + + } else if (sgp == SGP_READ) { + shmem_swp_unmap(entry); + page = find_get_page(mapping, idx); + if (page && !trylock_page(page)) { spin_unlock(&info->lock); - wait_on_page_locked(filepage); - page_cache_release(filepage); - filepage = NULL; + wait_on_page_locked(page); + page_cache_release(page); goto repeat; } spin_unlock(&info->lock); - } else { + + } else if (prealloc_page) { shmem_swp_unmap(entry); sbinfo = SHMEM_SB(inode->i_sb); if (sbinfo->max_blocks) { @@ -1421,126 +1386,86 @@ repeat: shmem_acct_block(info->flags)) goto nospace; percpu_counter_inc(&sbinfo->used_blocks); - spin_lock(&inode->i_lock); inode->i_blocks += BLOCKS_PER_PAGE; - spin_unlock(&inode->i_lock); } else if (shmem_acct_block(info->flags)) goto nospace; - if (!filepage) { - int ret; - - if (!prealloc_page) { - spin_unlock(&info->lock); - filepage = shmem_alloc_page(gfp, info, idx); - if (!filepage) { - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - error = -ENOMEM; - goto failed; - } - SetPageSwapBacked(filepage); + page = prealloc_page; + prealloc_page = NULL; - /* - * Precharge page while we can wait, compensate - * after - */ - error = mem_cgroup_cache_charge(filepage, - current->mm, GFP_KERNEL); - if (error) { - page_cache_release(filepage); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - filepage = NULL; - goto failed; - } - - spin_lock(&info->lock); - } else { - filepage = prealloc_page; - prealloc_page = NULL; - SetPageSwapBacked(filepage); - } - - entry = shmem_swp_alloc(info, idx, sgp); - if (IS_ERR(entry)) - error = PTR_ERR(entry); - else { - swap = *entry; - shmem_swp_unmap(entry); - } - ret = error || swap.val; - if (ret) - mem_cgroup_uncharge_cache_page(filepage); - else - ret = add_to_page_cache_lru(filepage, mapping, + entry = shmem_swp_alloc(info, idx, sgp, gfp); + if (IS_ERR(entry)) + error = PTR_ERR(entry); + else { + swap = *entry; + shmem_swp_unmap(entry); + } + ret = error || swap.val; + if (ret) + mem_cgroup_uncharge_cache_page(page); + else + ret = add_to_page_cache_lru(page, mapping, idx, GFP_NOWAIT); - /* - * At add_to_page_cache_lru() failure, uncharge will - * be done automatically. - */ - if (ret) { - spin_unlock(&info->lock); - page_cache_release(filepage); - shmem_unacct_blocks(info->flags, 1); - shmem_free_blocks(inode, 1); - filepage = NULL; - if (error) - goto failed; - goto repeat; - } - info->flags |= SHMEM_PAGEIN; + /* + * At add_to_page_cache_lru() failure, + * uncharge will be done automatically. + */ + if (ret) { + shmem_unacct_blocks(info->flags, 1); + shmem_free_blocks(inode, 1); + spin_unlock(&info->lock); + page_cache_release(page); + if (error) + goto out; + goto repeat; } + info->flags |= SHMEM_PAGEIN; info->alloced++; spin_unlock(&info->lock); - clear_highpage(filepage); - flush_dcache_page(filepage); - SetPageUptodate(filepage); + clear_highpage(page); + flush_dcache_page(page); + SetPageUptodate(page); if (sgp == SGP_DIRTY) - set_page_dirty(filepage); + set_page_dirty(page); + + } else { + spin_unlock(&info->lock); + error = -ENOMEM; + goto out; } done: - *pagep = filepage; + *pagep = page; error = 0; - goto out; +out: + if (prealloc_page) { + mem_cgroup_uncharge_cache_page(prealloc_page); + page_cache_release(prealloc_page); + } + return error; nospace: /* * Perhaps the page was brought in from swap between find_lock_page * and taking info->lock? We allow for that at add_to_page_cache_lru, * but must also avoid reporting a spurious ENOSPC while working on a - * full tmpfs. (When filepage has been passed in to shmem_getpage, it - * is already in page cache, which prevents this race from occurring.) + * full tmpfs. */ - if (!filepage) { - struct page *page = find_get_page(mapping, idx); - if (page) { - spin_unlock(&info->lock); - page_cache_release(page); - goto repeat; - } - } + page = find_get_page(mapping, idx); spin_unlock(&info->lock); - error = -ENOSPC; -failed: - if (*pagep != filepage) { - unlock_page(filepage); - page_cache_release(filepage); - } -out: - if (prealloc_page) { - mem_cgroup_uncharge_cache_page(prealloc_page); - page_cache_release(prealloc_page); + if (page) { + page_cache_release(page); + goto repeat; } - return error; + error = -ENOSPC; + goto out; } static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct inode *inode = vma->vm_file->f_path.dentry->d_inode; int error; - int ret; + int ret = VM_FAULT_LOCKED; if (((loff_t)vmf->pgoff << PAGE_CACHE_SHIFT) >= i_size_read(inode)) return VM_FAULT_SIGBUS; @@ -1548,11 +1473,12 @@ static int shmem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) error = shmem_getpage(inode, vmf->pgoff, &vmf->page, SGP_CACHE, &ret); if (error) return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); + if (ret & VM_FAULT_MAJOR) { count_vm_event(PGMAJFAULT); mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT); } - return ret | VM_FAULT_LOCKED; + return ret; } #ifdef CONFIG_NUMA @@ -1669,19 +1595,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_symlink_inline_operations; -/* - * Normally tmpfs avoids the use of shmem_readpage and shmem_write_begin; - * but providing them allows a tmpfs file to be used for splice, sendfile, and - * below the loop driver, in the generic fashion that many filesystems support. - */ -static int shmem_readpage(struct file *file, struct page *page) -{ - struct inode *inode = page->mapping->host; - int error = shmem_getpage(inode, page->index, &page, SGP_CACHE, NULL); - unlock_page(page); - return error; -} - static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, @@ -1689,7 +1602,6 @@ shmem_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = mapping->host; pgoff_t index = pos >> PAGE_CACHE_SHIFT; - *pagep = NULL; return shmem_getpage(inode, index, pagep, SGP_WRITE, NULL); } @@ -1846,6 +1758,119 @@ static ssize_t shmem_file_aio_read(struct kiocb *iocb, return retval; } +static ssize_t shmem_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + struct inode *inode = mapping->host; + unsigned int loff, nr_pages, req_pages; + struct page *pages[PIPE_DEF_BUFFERS]; + struct partial_page partial[PIPE_DEF_BUFFERS]; + struct page *page; + pgoff_t index, end_index; + loff_t isize, left; + int error, page_nr; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &page_cache_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + isize = i_size_read(inode); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + if (splice_grow_spd(pipe, &spd)) + return -ENOMEM; + + index = *ppos >> PAGE_CACHE_SHIFT; + loff = *ppos & ~PAGE_CACHE_MASK; + req_pages = (len + loff + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + nr_pages = min(req_pages, pipe->buffers); + + spd.nr_pages = find_get_pages_contig(mapping, index, + nr_pages, spd.pages); + index += spd.nr_pages; + error = 0; + + while (spd.nr_pages < nr_pages) { + error = shmem_getpage(inode, index, &page, SGP_CACHE, NULL); + if (error) + break; + unlock_page(page); + spd.pages[spd.nr_pages++] = page; + index++; + } + + index = *ppos >> PAGE_CACHE_SHIFT; + nr_pages = spd.nr_pages; + spd.nr_pages = 0; + + for (page_nr = 0; page_nr < nr_pages; page_nr++) { + unsigned int this_len; + + if (!len) + break; + + this_len = min_t(unsigned long, len, PAGE_CACHE_SIZE - loff); + page = spd.pages[page_nr]; + + if (!PageUptodate(page) || page->mapping != mapping) { + error = shmem_getpage(inode, index, &page, + SGP_CACHE, NULL); + if (error) + break; + unlock_page(page); + page_cache_release(spd.pages[page_nr]); + spd.pages[page_nr] = page; + } + + isize = i_size_read(inode); + end_index = (isize - 1) >> PAGE_CACHE_SHIFT; + if (unlikely(!isize || index > end_index)) + break; + + if (end_index == index) { + unsigned int plen; + + plen = ((isize - 1) & ~PAGE_CACHE_MASK) + 1; + if (plen <= loff) + break; + + this_len = min(this_len, plen - loff); + len = this_len; + } + + spd.partial[page_nr].offset = loff; + spd.partial[page_nr].len = this_len; + len -= this_len; + loff = 0; + spd.nr_pages++; + index++; + } + + while (page_nr < nr_pages) + page_cache_release(spd.pages[page_nr++]); + + if (spd.nr_pages) + error = splice_to_pipe(pipe, &spd); + + splice_shrink_spd(pipe, &spd); + + if (error > 0) { + *ppos += error; + file_accessed(in); + } + return error; +} + static int shmem_statfs(struct dentry *dentry, struct kstatfs *buf) { struct shmem_sb_info *sbinfo = SHMEM_SB(dentry->d_sb); @@ -2006,7 +2031,7 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s int error; int len; struct inode *inode; - struct page *page = NULL; + struct page *page; char *kaddr; struct shmem_inode_info *info; @@ -2684,7 +2709,6 @@ static const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, #ifdef CONFIG_TMPFS - .readpage = shmem_readpage, .write_begin = shmem_write_begin, .write_end = shmem_write_end, #endif @@ -2701,7 +2725,7 @@ static const struct file_operations shmem_file_operations = { .aio_read = shmem_file_aio_read, .aio_write = generic_file_aio_write, .fsync = noop_fsync, - .splice_read = generic_file_splice_read, + .splice_read = shmem_file_splice_read, .splice_write = generic_file_splice_write, #endif }; @@ -3042,13 +3066,29 @@ int shmem_zero_setup(struct vm_area_struct *vma) * suit tmpfs, since it may have pages in swapcache, and needs to find those * for itself; although drivers/gpu/drm i915 and ttm rely upon this support. * - * Provide a stub for those callers to start using now, then later - * flesh it out to call shmem_getpage() with additional gfp mask, when - * shmem_file_splice_read() is added and shmem_readpage() is removed. + * i915_gem_object_get_pages_gtt() mixes __GFP_NORETRY | __GFP_NOWARN in + * with the mapping_gfp_mask(), to avoid OOMing the machine unnecessarily. */ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, pgoff_t index, gfp_t gfp) { +#ifdef CONFIG_SHMEM + struct inode *inode = mapping->host; + struct page *page; + int error; + + BUG_ON(mapping->a_ops != &shmem_aops); + error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL); + if (error) + page = ERR_PTR(error); + else + unlock_page(page); + return page; +#else + /* + * The tiny !SHMEM case uses ramfs without swap + */ return read_cache_page_gfp(mapping, index, gfp); +#endif } EXPORT_SYMBOL_GPL(shmem_read_mapping_page_gfp); diff --git a/mm/sparse.c b/mm/sparse.c index aa64b12..858e1df 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -40,7 +40,7 @@ static u8 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; static u16 section_to_node_table[NR_MEM_SECTIONS] __cacheline_aligned; #endif -int page_to_nid(struct page *page) +int page_to_nid(const struct page *page) { return section_to_node_table[page_to_section(page)]; } diff --git a/mm/thrash.c b/mm/thrash.c index fabf2d0..e53f7d0 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -6,7 +6,7 @@ * Released under the GPL, see the file COPYING for details. * * Simple token based thrashing protection, using the algorithm - * described in: http://www.cs.wm.edu/~sjiang/token.pdf + * described in: http://www.cse.ohio-state.edu/hpcs/WWW/HTML/publications/abs05-1.html * * Sep 2006, Ashwin Chaugule <ashwin.chaugule@celunite.com> * Improved algorithm to pass token: @@ -30,8 +30,6 @@ static DEFINE_SPINLOCK(swap_token_lock); struct mm_struct *swap_token_mm; struct mem_cgroup *swap_token_memcg; -static unsigned int global_faults; -static unsigned int last_aging; #ifdef CONFIG_CGROUP_MEM_RES_CTLR static struct mem_cgroup *swap_token_memcg_from_mm(struct mm_struct *mm) @@ -55,6 +53,8 @@ void grab_swap_token(struct mm_struct *mm) { int current_interval; unsigned int old_prio = mm->token_priority; + static unsigned int global_faults; + static unsigned int last_aging; global_faults++; @@ -67,6 +67,17 @@ void grab_swap_token(struct mm_struct *mm) if (!swap_token_mm) goto replace_token; + /* + * Usually, we don't need priority aging because long interval faults + * makes priority decrease quickly. But there is one exception. If the + * token owner task is sleeping, it never make long interval faults. + * Thus, we need a priority aging mechanism instead. The requirements + * of priority aging are + * 1) An aging interval is reasonable enough long. Too short aging + * interval makes quick swap token lost and decrease performance. + * 2) The swap token owner task have to get priority aging even if + * it's under sleep. + */ if ((global_faults - last_aging) > TOKEN_AGING_INTERVAL) { swap_token_mm->token_priority /= 2; last_aging = global_faults; diff --git a/mm/truncate.c b/mm/truncate.c index 003c6c6..232eb27 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -199,9 +199,6 @@ int invalidate_inode_page(struct page *page) * The first pass will remove most pages, so the search cost of the second pass * is low. * - * When looking at page->index outside the page lock we need to be careful to - * copy it into a local to avoid races (it could change at any time). - * * We pass down the cache-hot hint to the page freeing code. Even if the * mapping is large, it is probably the case that the final pages are the most * recently touched, and freeing happens in ascending file offset order. @@ -210,10 +207,10 @@ void truncate_inode_pages_range(struct address_space *mapping, loff_t lstart, loff_t lend) { const pgoff_t start = (lstart + PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT; - pgoff_t end; const unsigned partial = lstart & (PAGE_CACHE_SIZE - 1); struct pagevec pvec; - pgoff_t next; + pgoff_t index; + pgoff_t end; int i; cleancache_flush_inode(mapping); @@ -224,24 +221,21 @@ void truncate_inode_pages_range(struct address_space *mapping, end = (lend >> PAGE_CACHE_SHIFT); pagevec_init(&pvec, 0); - next = start; - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index = page->index; - if (page_index > end) { - next = page_index; + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; - } - if (page_index > next) - next = page_index; - next++; if (!trylock_page(page)) continue; + WARN_ON(page->index != index); if (PageWriteback(page)) { unlock_page(page); continue; @@ -252,6 +246,7 @@ void truncate_inode_pages_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } if (partial) { @@ -264,16 +259,17 @@ void truncate_inode_pages_range(struct address_space *mapping, } } - next = start; + index = start; for ( ; ; ) { cond_resched(); - if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { - if (next == start) + if (!pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + if (index == start) break; - next = start; + index = start; continue; } - if (pvec.pages[0]->index > end) { + if (index == start && pvec.pages[0]->index > end) { pagevec_release(&pvec); break; } @@ -281,18 +277,20 @@ void truncate_inode_pages_range(struct address_space *mapping, for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - if (page->index > end) + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) break; + lock_page(page); + WARN_ON(page->index != index); wait_on_page_writeback(page); truncate_inode_page(mapping, page); - if (page->index > next) - next = page->index; - next++; unlock_page(page); } pagevec_release(&pvec); mem_cgroup_uncharge_end(); + index++; } cleancache_flush_inode(mapping); } @@ -333,35 +331,26 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next = start; + pgoff_t index = start; unsigned long ret; unsigned long count = 0; int i; pagevec_init(&pvec, 0); - while (next <= end && - pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t index; - int lock_failed; - - lock_failed = !trylock_page(page); - /* - * We really shouldn't be looking at the ->index of an - * unlocked page. But we're not allowed to lock these - * pages. So we rely upon nobody altering the ->index - * of this (pinned-by-us) page. - */ + /* We rely upon deletion not changing page->index */ index = page->index; - if (index > next) - next = index; - next++; - if (lock_failed) - continue; + if (index > end) + break; + if (!trylock_page(page)) + continue; + WARN_ON(page->index != index); ret = invalidate_inode_page(page); unlock_page(page); /* @@ -371,12 +360,11 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping, if (!ret) deactivate_page(page); count += ret; - if (next > end) - break; } pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } return count; } @@ -442,37 +430,32 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct pagevec pvec; - pgoff_t next; + pgoff_t index; int i; int ret = 0; int ret2 = 0; int did_range_unmap = 0; - int wrapped = 0; cleancache_flush_inode(mapping); pagevec_init(&pvec, 0); - next = start; - while (next <= end && !wrapped && - pagevec_lookup(&pvec, mapping, next, - min(end - next, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { + index = start; + while (index <= end && pagevec_lookup(&pvec, mapping, index, + min(end - index, (pgoff_t)PAGEVEC_SIZE - 1) + 1)) { mem_cgroup_uncharge_start(); for (i = 0; i < pagevec_count(&pvec); i++) { struct page *page = pvec.pages[i]; - pgoff_t page_index; + + /* We rely upon deletion not changing page->index */ + index = page->index; + if (index > end) + break; lock_page(page); + WARN_ON(page->index != index); if (page->mapping != mapping) { unlock_page(page); continue; } - page_index = page->index; - next = page_index + 1; - if (next == 0) - wrapped = 1; - if (page_index > end) { - unlock_page(page); - break; - } wait_on_page_writeback(page); if (page_mapped(page)) { if (!did_range_unmap) { @@ -480,9 +463,9 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - (loff_t)(end - page_index + 1) - << PAGE_CACHE_SHIFT, + (loff_t)index << PAGE_CACHE_SHIFT, + (loff_t)(1 + end - index) + << PAGE_CACHE_SHIFT, 0); did_range_unmap = 1; } else { @@ -490,8 +473,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Just zap this page */ unmap_mapping_range(mapping, - (loff_t)page_index<<PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE, 0); + (loff_t)index << PAGE_CACHE_SHIFT, + PAGE_CACHE_SIZE, 0); } } BUG_ON(page_mapped(page)); @@ -507,6 +490,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, pagevec_release(&pvec); mem_cgroup_uncharge_end(); cond_resched(); + index++; } cleancache_flush_inode(mapping); return ret; @@ -531,8 +515,8 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); /** * truncate_pagecache - unmap and remove pagecache that has been truncated * @inode: inode - * @old: old file offset - * @new: new file offset + * @oldsize: old file size + * @newsize: new file size * * inode's new i_size must already be written before truncate_pagecache * is called. @@ -544,9 +528,10 @@ EXPORT_SYMBOL_GPL(invalidate_inode_pages2); * situations such as writepage being called for a page that has already * had its underlying blocks deallocated. */ -void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) +void truncate_pagecache(struct inode *inode, loff_t oldsize, loff_t newsize) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(newsize, PAGE_SIZE); /* * unmap_mapping_range is called twice, first simply for @@ -557,9 +542,9 @@ void truncate_pagecache(struct inode *inode, loff_t old, loff_t new) * truncate_inode_pages finishes, hence the second * unmap_mapping_range call must be made for correctness. */ - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); - truncate_inode_pages(mapping, new); - unmap_mapping_range(mapping, new + PAGE_SIZE - 1, 0, 1); + unmap_mapping_range(mapping, holebegin, 0, 1); + truncate_inode_pages(mapping, newsize); + unmap_mapping_range(mapping, holebegin, 0, 1); } EXPORT_SYMBOL(truncate_pagecache); @@ -589,29 +574,31 @@ EXPORT_SYMBOL(truncate_setsize); /** * vmtruncate - unmap mappings "freed" by truncate() syscall * @inode: inode of the file used - * @offset: file offset to start truncating + * @newsize: file offset to start truncating * * This function is deprecated and truncate_setsize or truncate_pagecache * should be used instead, together with filesystem specific block truncation. */ -int vmtruncate(struct inode *inode, loff_t offset) +int vmtruncate(struct inode *inode, loff_t newsize) { int error; - error = inode_newsize_ok(inode, offset); + error = inode_newsize_ok(inode, newsize); if (error) return error; - truncate_setsize(inode, offset); + truncate_setsize(inode, newsize); if (inode->i_op->truncate) inode->i_op->truncate(inode); return 0; } EXPORT_SYMBOL(vmtruncate); -int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) +int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) { struct address_space *mapping = inode->i_mapping; + loff_t holebegin = round_up(lstart, PAGE_SIZE); + loff_t holelen = 1 + lend - holebegin; /* * If the underlying filesystem is not going to provide @@ -623,10 +610,10 @@ int vmtruncate_range(struct inode *inode, loff_t offset, loff_t end) mutex_lock(&inode->i_mutex); inode_dio_wait(inode); - unmap_mapping_range(mapping, offset, (end - offset), 1); - inode->i_op->truncate_range(inode, offset, end); + unmap_mapping_range(mapping, holebegin, holelen, 1); + inode->i_op->truncate_range(inode, lstart, lend); /* unmap again to remove racily COWed private pages */ - unmap_mapping_range(mapping, offset, (end - offset), 1); + unmap_mapping_range(mapping, holebegin, holelen, 1); mutex_unlock(&inode->i_mutex); return 0; |