diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 6 | ||||
-rw-r--r-- | mm/fremap.c | 28 | ||||
-rw-r--r-- | mm/hugetlb.c | 6 | ||||
-rw-r--r-- | mm/madvise.c | 2 | ||||
-rw-r--r-- | mm/memory.c | 213 | ||||
-rw-r--r-- | mm/mempolicy.c | 12 | ||||
-rw-r--r-- | mm/mmap.c | 11 | ||||
-rw-r--r-- | mm/mprotect.c | 8 | ||||
-rw-r--r-- | mm/msync.c | 12 | ||||
-rw-r--r-- | mm/nommu.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 75 | ||||
-rw-r--r-- | mm/rmap.c | 58 | ||||
-rw-r--r-- | mm/swap.c | 3 | ||||
-rw-r--r-- | mm/thrash.c | 10 | ||||
-rw-r--r-- | mm/truncate.c | 6 | ||||
-rw-r--r-- | mm/vmscan.c | 29 |
16 files changed, 273 insertions, 208 deletions
@@ -125,12 +125,10 @@ comment "Memory hotplug is currently incompatible with Software Suspend" # space can be handled with less contention: split it at this NR_CPUS. # Default to 4 for wider testing, though 8 might be more appropriate. # ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock. -# PA-RISC's debug spinlock_t is too large for the 32-bit struct page. -# ARM26 and SPARC32 and PPC64 may use one page for multiple page tables. +# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes. # config SPLIT_PTLOCK_CPUS int default "4096" if ARM && !CPU_CACHE_VIPT - default "4096" if PARISC && DEBUG_SPINLOCK && !64BIT - default "4096" if ARM26 || SPARC32 || PPC64 + default "4096" if PARISC && !PA20 default "4" diff --git a/mm/fremap.c b/mm/fremap.c index d862be3..f851775 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -27,24 +27,20 @@ static int zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page = NULL; if (pte_present(pte)) { - unsigned long pfn = pte_pfn(pte); - flush_cache_page(vma, addr, pfn); + flush_cache_page(vma, addr, pte_pfn(pte)); pte = ptep_clear_flush(vma, addr, ptep); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out; + page = vm_normal_page(vma, addr, pte); + if (page) { + if (pte_dirty(pte)) + set_page_dirty(page); + page_remove_rmap(page); + page_cache_release(page); } - page = pfn_to_page(pfn); - if (pte_dirty(pte)) - set_page_dirty(page); - page_remove_rmap(page); - page_cache_release(page); } else { if (!pte_file(pte)) free_swap_and_cache(pte_to_swp_entry(pte)); pte_clear(mm, addr, ptep); } -out: return !!page; } @@ -65,8 +61,6 @@ int install_page(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_RESERVED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) @@ -122,8 +116,6 @@ int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, pte_t pte_val; spinlock_t *ptl; - BUG_ON(vma->vm_flags & VM_RESERVED); - pgd = pgd_offset(mm, addr); pud = pud_alloc(mm, pgd, addr); if (!pud) @@ -204,12 +196,10 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, * Make sure the vma is shared, that it supports prefaulting, * and that the remapped range is valid and fully within * the single existing vma. vm_private_data is used as a - * swapout cursor in a VM_NONLINEAR vma (unless VM_RESERVED - * or VM_LOCKED, but VM_LOCKED could be revoked later on). + * swapout cursor in a VM_NONLINEAR vma. */ if (vma && (vma->vm_flags & VM_SHARED) && - (!vma->vm_private_data || - (vma->vm_flags & (VM_NONLINEAR|VM_RESERVED))) && + (!vma->vm_private_data || (vma->vm_flags & VM_NONLINEAR)) && vma->vm_ops && vma->vm_ops->populate && end > start && start >= vma->vm_start && end <= vma->vm_end) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 728e9bd..3e52df7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -22,6 +22,10 @@ unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; + +/* + * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + */ static DEFINE_SPINLOCK(hugetlb_lock); static void enqueue_huge_page(struct page *page) @@ -61,8 +65,10 @@ static struct page *alloc_fresh_huge_page(void) HUGETLB_PAGE_ORDER); nid = (nid + 1) % num_online_nodes(); if (page) { + spin_lock(&hugetlb_lock); nr_huge_pages++; nr_huge_pages_node[page_to_nid(page)]++; + spin_unlock(&hugetlb_lock); } return page; } diff --git a/mm/madvise.c b/mm/madvise.c index 17aaf3e..2b7cf04 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -126,7 +126,7 @@ static long madvise_dontneed(struct vm_area_struct * vma, unsigned long start, unsigned long end) { *prev = vma; - if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_RESERVED)) + if (vma->vm_flags & (VM_LOCKED|VM_HUGETLB|VM_PFNMAP)) return -EINVAL; if (unlikely(vma->vm_flags & VM_NONLINEAR)) { diff --git a/mm/memory.c b/mm/memory.c index 2998cfc..9ab206b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss) } /* - * This function is called to print an error when a pte in a - * !VM_RESERVED region is found pointing to an invalid pfn (which - * is an error. + * This function is called to print an error when a bad pte + * is found. For example, we might have a PFN-mapped pte in + * a region that doesn't allow it. * * The calling function must still handle the error. */ @@ -350,6 +350,59 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr) } /* + * This function gets the "struct page" associated with a pte. + * + * NOTE! Some mappings do not have "struct pages". A raw PFN mapping + * will have each page table entry just pointing to a raw page frame + * number, and as far as the VM layer is concerned, those do not have + * pages associated with them - even if the PFN might point to memory + * that otherwise is perfectly fine and has a "struct page". + * + * The way we recognize those mappings is through the rules set up + * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set, + * and the vm_pgoff will point to the first PFN mapped: thus every + * page that is a raw mapping will always honor the rule + * + * pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT) + * + * and if that isn't true, the page has been COW'ed (in which case it + * _does_ have a "struct page" associated with it even if it is in a + * VM_PFNMAP range). + */ +struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte) +{ + unsigned long pfn = pte_pfn(pte); + + if (vma->vm_flags & VM_PFNMAP) { + unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT; + if (pfn == vma->vm_pgoff + off) + return NULL; + } + + /* + * Add some anal sanity checks for now. Eventually, + * we should just do "return pfn_to_page(pfn)", but + * in the meantime we check that we get a valid pfn, + * and that the resulting page looks ok. + * + * Remove this test eventually! + */ + if (unlikely(!pfn_valid(pfn))) { + print_bad_pte(vma, pte, addr); + return NULL; + } + + /* + * NOTE! We still have PageReserved() pages in the page + * tables. + * + * The PAGE_ZERO() pages and various VDSO mappings can + * cause them to exist. + */ + return pfn_to_page(pfn); +} + +/* * copy one vm_area from one task to the other. Assumes the page tables * already present in the new task to be cleared in the whole range * covered by this vma. @@ -363,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, unsigned long vm_flags = vma->vm_flags; pte_t pte = *src_pte; struct page *page; - unsigned long pfn; /* pte contains position in swap or file, so copy. */ if (unlikely(!pte_present(pte))) { @@ -381,23 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, goto out_set_pte; } - /* If the region is VM_RESERVED, the mapping is not - * mapped via rmap - duplicate the pte as is. - */ - if (vm_flags & VM_RESERVED) - goto out_set_pte; - - pfn = pte_pfn(pte); - /* If the pte points outside of valid memory but - * the region is not VM_RESERVED, we have a problem. - */ - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, pte, addr); - goto out_set_pte; /* try to do something sane */ - } - - page = pfn_to_page(pfn); - /* * If it's a COW mapping, write protect it both * in the parent and the child @@ -414,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm, if (vm_flags & VM_SHARED) pte = pte_mkclean(pte); pte = pte_mkold(pte); - get_page(page); - page_dup_rmap(page); - rss[!!PageAnon(page)]++; + + page = vm_normal_page(vma, addr, pte); + if (page) { + get_page(page); + page_dup_rmap(page); + rss[!!PageAnon(page)]++; + } out_set_pte: set_pte_at(dst_mm, addr, dst_pte, pte); @@ -528,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm, * readonly mappings. The tradeoff is that copy_page_range is more * efficient than faulting. */ - if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) { + if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) { if (!vma->anon_vma) return 0; } @@ -568,17 +607,11 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, continue; } if (pte_present(ptent)) { - struct page *page = NULL; + struct page *page; (*zap_work) -= PAGE_SIZE; - if (!(vma->vm_flags & VM_RESERVED)) { - unsigned long pfn = pte_pfn(ptent); - if (unlikely(!pfn_valid(pfn))) - print_bad_pte(vma, ptent, addr); - else - page = pfn_to_page(pfn); - } + page = vm_normal_page(vma, addr, ptent); if (unlikely(details) && page) { /* * unmap_shared_mapping_pages() wants to @@ -834,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address, /* * Do a quick page-table lookup for a single page. */ -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { pgd_t *pgd; @@ -842,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; - unsigned long pfn; struct page *page; + struct mm_struct *mm = vma->vm_mm; page = follow_huge_addr(mm, address, flags & FOLL_WRITE); if (!IS_ERR(page)) { @@ -879,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address, goto unlock; if ((flags & FOLL_WRITE) && !pte_write(pte)) goto unlock; - pfn = pte_pfn(pte); - if (!pfn_valid(pfn)) + page = vm_normal_page(vma, address, pte); + if (unlikely(!page)) goto unlock; - page = pfn_to_page(pfn); if (flags & FOLL_GET) get_page(page); if (flags & FOLL_TOUCH) { @@ -956,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, return i ? : -EFAULT; } if (pages) { - pages[i] = pte_page(*pte); - get_page(pages[i]); + struct page *page = vm_normal_page(vma, start, *pte); + pages[i] = page; + if (page) + get_page(page); } pte_unmap(pte); if (vmas) @@ -968,7 +1002,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED)) + if (!vma || (vma->vm_flags & VM_IO) || !(vm_flags & vma->vm_flags)) return i ? : -EFAULT; @@ -992,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, foll_flags |= FOLL_WRITE; cond_resched(); - while (!(page = follow_page(mm, start, foll_flags))) { + while (!(page = follow_page(vma, start, foll_flags))) { int ret; ret = __handle_mm_fault(mm, vma, start, foll_flags & FOLL_WRITE); @@ -1191,10 +1225,17 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, * rest of the world about it: * VM_IO tells people not to look at these pages * (accesses can have side effects). - * VM_RESERVED tells the core MM not to "manage" these pages - * (e.g. refcount, mapcount, try to swap them out). + * VM_RESERVED is specified all over the place, because + * in 2.4 it kept swapout's vma scan off this vma; but + * in 2.6 the LRU scan won't even find its pages, so this + * flag means no more than count its pages in reserved_vm, + * and omit it from core dump, even when VM_IO turned off. + * VM_PFNMAP tells the core MM that the base pages are just + * raw PFN mappings, and do not have a "struct page" associated + * with them. */ - vma->vm_flags |= VM_IO | VM_RESERVED; + vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; + vma->vm_pgoff = pfn; BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; @@ -1249,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } +static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va) +{ + /* + * If the source page was a PFN mapping, we don't have + * a "struct page" for it. We do a best-effort copy by + * just copying from the original user address. If that + * fails, we just zero-fill it. Live with it. + */ + if (unlikely(!src)) { + void *kaddr = kmap_atomic(dst, KM_USER0); + unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE); + if (left) + memset(kaddr, 0, PAGE_SIZE); + kunmap_atomic(kaddr, KM_USER0); + return; + + } + copy_user_highpage(dst, src, va); +} + /* * This routine handles present pages, when users try to write * to a shared page. It is done by copying the page to a new address @@ -1271,22 +1332,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *page_table, pmd_t *pmd, spinlock_t *ptl, pte_t orig_pte) { - struct page *old_page, *new_page; - unsigned long pfn = pte_pfn(orig_pte); + struct page *old_page, *src_page, *new_page; pte_t entry; int ret = VM_FAULT_MINOR; - BUG_ON(vma->vm_flags & VM_RESERVED); - - if (unlikely(!pfn_valid(pfn))) { - /* - * Page table corrupted: show pte and kill process. - */ - print_bad_pte(vma, orig_pte, address); - ret = VM_FAULT_OOM; - goto unlock; - } - old_page = pfn_to_page(pfn); + old_page = vm_normal_page(vma, address, orig_pte); + src_page = old_page; + if (!old_page) + goto gotten; if (PageAnon(old_page) && !TestSetPageLocked(old_page)) { int reuse = can_share_swap_page(old_page); @@ -1307,11 +1360,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, * Ok, we need to copy. Oh, well.. */ page_cache_get(old_page); +gotten: pte_unmap_unlock(page_table, ptl); if (unlikely(anon_vma_prepare(vma))) goto oom; - if (old_page == ZERO_PAGE(address)) { + if (src_page == ZERO_PAGE(address)) { new_page = alloc_zeroed_user_highpage(vma, address); if (!new_page) goto oom; @@ -1319,7 +1373,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, new_page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!new_page) goto oom; - copy_user_highpage(new_page, old_page, address); + cow_user_page(new_page, src_page, address); } /* @@ -1327,11 +1381,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, */ page_table = pte_offset_map_lock(mm, pmd, address, &ptl); if (likely(pte_same(*page_table, orig_pte))) { - page_remove_rmap(old_page); - if (!PageAnon(old_page)) { + if (old_page) { + page_remove_rmap(old_page); + if (!PageAnon(old_page)) { + dec_mm_counter(mm, file_rss); + inc_mm_counter(mm, anon_rss); + } + } else inc_mm_counter(mm, anon_rss); - dec_mm_counter(mm, file_rss); - } flush_cache_page(vma, address, pfn); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); @@ -1345,13 +1402,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, new_page = old_page; ret |= VM_FAULT_WRITE; } - page_cache_release(new_page); - page_cache_release(old_page); + if (new_page) + page_cache_release(new_page); + if (old_page) + page_cache_release(old_page); unlock: pte_unmap_unlock(page_table, ptl); return ret; oom: - page_cache_release(old_page); + if (old_page) + page_cache_release(old_page); return VM_FAULT_OOM; } @@ -1849,7 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma, int anon = 0; pte_unmap(page_table); - if (vma->vm_file) { mapping = vma->vm_file->f_mapping; sequence = mapping->truncate_count; @@ -1882,7 +1941,7 @@ retry: page = alloc_page_vma(GFP_HIGHUSER, vma, address); if (!page) goto oom; - copy_user_highpage(page, new_page, address); + cow_user_page(page, new_page, address); page_cache_release(new_page); new_page = page; anon = 1; @@ -1924,7 +1983,7 @@ retry: inc_mm_counter(mm, anon_rss); lru_cache_add_active(new_page); page_add_anon_rmap(new_page, vma, address); - } else if (!(vma->vm_flags & VM_RESERVED)) { + } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(new_page); } @@ -2101,6 +2160,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } +#else +/* Workaround for gcc 2.96 */ +int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address) +{ + return 0; +} #endif /* __PAGETABLE_PUD_FOLDED */ #ifndef __PAGETABLE_PMD_FOLDED @@ -2129,6 +2194,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) spin_unlock(&mm->page_table_lock); return 0; } +#else +/* Workaround for gcc 2.96 */ +int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address) +{ + return 0; +} #endif /* __PAGETABLE_PMD_FOLDED */ int make_pages_present(unsigned long addr, unsigned long end) @@ -2203,7 +2274,7 @@ static int __init gate_vma_init(void) gate_vma.vm_start = FIXADDR_USER_START; gate_vma.vm_end = FIXADDR_USER_END; gate_vma.vm_page_prot = PAGE_READONLY; - gate_vma.vm_flags = VM_RESERVED; + gate_vma.vm_flags = 0; return 0; } __initcall(gate_vma_init); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 5abc57c..bec88c8 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -189,17 +189,15 @@ static int check_pte_range(struct vm_area_struct *vma, pmd_t *pmd, orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; + struct page *page; unsigned int nid; if (!pte_present(*pte)) continue; - pfn = pte_pfn(*pte); - if (!pfn_valid(pfn)) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - nid = pfn_to_nid(pfn); + nid = page_to_nid(page); if (!node_isset(nid, *nodes)) break; } while (pte++, addr += PAGE_SIZE, addr != end); @@ -269,8 +267,6 @@ check_range(struct mm_struct *mm, unsigned long start, unsigned long end, first = find_vma(mm, start); if (!first) return ERR_PTR(-EFAULT); - if (first->vm_flags & VM_RESERVED) - return ERR_PTR(-EACCES); prev = NULL; for (vma = first; vma && vma->vm_start < end; vma = vma->vm_next) { if (!vma->vm_next && vma->vm_end < end) @@ -1076,17 +1076,6 @@ munmap_back: error = file->f_op->mmap(file, vma); if (error) goto unmap_and_free_vma; - if ((vma->vm_flags & (VM_SHARED | VM_WRITE | VM_RESERVED)) - == (VM_WRITE | VM_RESERVED)) { - printk(KERN_WARNING "program %s is using MAP_PRIVATE, " - "PROT_WRITE mmap of VM_RESERVED memory, which " - "is deprecated. Please report this to " - "linux-kernel@vger.kernel.org\n",current->comm); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); - error = -EACCES; - goto unmap_and_free_vma; - } } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) diff --git a/mm/mprotect.c b/mm/mprotect.c index 17a2b52..653b857 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -124,14 +124,6 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, * a MAP_NORESERVE private mapping to writable will now reserve. */ if (newflags & VM_WRITE) { - if (oldflags & VM_RESERVED) { - BUG_ON(oldflags & VM_WRITE); - printk(KERN_WARNING "program %s is using MAP_PRIVATE, " - "PROT_WRITE mprotect of VM_RESERVED memory, " - "which is deprecated. Please report this to " - "linux-kernel@vger.kernel.org\n",current->comm); - return -EACCES; - } if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_SHARED|VM_HUGETLB))) { charged = nrpages; if (security_vm_enough_memory(charged)) @@ -27,7 +27,6 @@ static void msync_pte_range(struct vm_area_struct *vma, pmd_t *pmd, again: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); do { - unsigned long pfn; struct page *page; if (progress >= 64) { @@ -40,13 +39,9 @@ again: continue; if (!pte_maybe_dirty(*pte)) continue; - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, addr); + page = vm_normal_page(vma, addr, *pte); + if (!page) continue; - } - page = pfn_to_page(pfn); - if (ptep_clear_flush_dirty(vma, addr, pte) || page_test_and_clear_dirty(page)) set_page_dirty(page); @@ -97,9 +92,8 @@ static void msync_page_range(struct vm_area_struct *vma, /* For hugepages we can't go walking the page table normally, * but that's ok, hugetlbfs is memory based, so we don't need * to do anything more on an msync(). - * Can't do anything with VM_RESERVED regions either. */ - if (vma->vm_flags & (VM_HUGETLB|VM_RESERVED)) + if (vma->vm_flags & VM_HUGETLB) return; BUG_ON(addr >= end); @@ -1045,7 +1045,7 @@ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL(find_vma); -struct page *follow_page(struct mm_struct *mm, unsigned long address, +struct page *follow_page(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags) { return NULL; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index bd4de59..b257720 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -140,18 +140,13 @@ static void bad_page(const char *function, struct page *page) 1 << PG_reclaim | 1 << PG_slab | 1 << PG_swapcache | - 1 << PG_writeback | - 1 << PG_reserved ); + 1 << PG_writeback ); set_page_count(page, 0); reset_page_mapcount(page); page->mapping = NULL; add_taint(TAINT_BAD_PAGE); } -#ifndef CONFIG_HUGETLB_PAGE -#define prep_compound_page(page, order) do { } while (0) -#define destroy_compound_page(page, order) do { } while (0) -#else /* * Higher-order pages are called "compound pages". They are structured thusly: * @@ -205,7 +200,6 @@ static void destroy_compound_page(struct page *page, unsigned long order) ClearPageCompound(p); } } -#endif /* CONFIG_HUGETLB_PAGE */ /* * function for dealing with page's order in buddy system. @@ -340,7 +334,7 @@ static inline void __free_pages_bulk (struct page *page, zone->free_area[order].nr_free++; } -static inline void free_pages_check(const char *function, struct page *page) +static inline int free_pages_check(const char *function, struct page *page) { if ( page_mapcount(page) || page->mapping != NULL || @@ -358,6 +352,12 @@ static inline void free_pages_check(const char *function, struct page *page) bad_page(function, page); if (PageDirty(page)) __ClearPageDirty(page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not free the page. But we shall soon need + * to do more, for when the ZERO_PAGE count wraps negative. + */ + return PageReserved(page); } /* @@ -397,11 +397,10 @@ void __free_pages_ok(struct page *page, unsigned int order) { LIST_HEAD(list); int i; + int reserved = 0; arch_free_page(page, order); - mod_page_state(pgfree, 1 << order); - #ifndef CONFIG_MMU if (order > 0) for (i = 1 ; i < (1 << order) ; ++i) @@ -409,8 +408,12 @@ void __free_pages_ok(struct page *page, unsigned int order) #endif for (i = 0 ; i < (1 << order) ; ++i) - free_pages_check(__FUNCTION__, page + i); + reserved += free_pages_check(__FUNCTION__, page + i); + if (reserved) + return; + list_add(&page->lru, &list); + mod_page_state(pgfree, 1 << order); kernel_map_pages(page, 1<<order, 0); free_pages_bulk(page_zone(page), 1, &list, order); } @@ -468,7 +471,7 @@ void set_page_refs(struct page *page, int order) /* * This page is about to be returned from the page allocator */ -static void prep_new_page(struct page *page, int order) +static int prep_new_page(struct page *page, int order) { if ( page_mapcount(page) || page->mapping != NULL || @@ -486,12 +489,20 @@ static void prep_new_page(struct page *page, int order) 1 << PG_reserved ))) bad_page(__FUNCTION__, page); + /* + * For now, we report if PG_reserved was found set, but do not + * clear it, and do not allocate the page: as a safety net. + */ + if (PageReserved(page)) + return 1; + page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_referenced | 1 << PG_arch_1 | 1 << PG_checked | 1 << PG_mappedtodisk); set_page_private(page, 0); set_page_refs(page, order); kernel_map_pages(page, 1 << order, 1); + return 0; } /* @@ -674,11 +685,14 @@ static void fastcall free_hot_cold_page(struct page *page, int cold) arch_free_page(page, 0); - kernel_map_pages(page, 1, 0); - inc_page_state(pgfree); if (PageAnon(page)) page->mapping = NULL; - free_pages_check(__FUNCTION__, page); + if (free_pages_check(__FUNCTION__, page)) + return; + + inc_page_state(pgfree); + kernel_map_pages(page, 1, 0); + pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); list_add(&page->lru, &pcp->list); @@ -717,12 +731,14 @@ static struct page * buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) { unsigned long flags; - struct page *page = NULL; + struct page *page; int cold = !!(gfp_flags & __GFP_COLD); +again: if (order == 0) { struct per_cpu_pages *pcp; + page = NULL; pcp = &zone_pcp(zone, get_cpu())->pcp[cold]; local_irq_save(flags); if (pcp->count <= pcp->low) @@ -744,7 +760,8 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) if (page != NULL) { BUG_ON(bad_range(zone, page)); mod_page_state_zone(zone, pgalloc, 1 << order); - prep_new_page(page, order); + if (prep_new_page(page, order)) + goto again; if (gfp_flags & __GFP_ZERO) prep_zero_page(page, order, gfp_flags); @@ -756,9 +773,12 @@ buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags) } #define ALLOC_NO_WATERMARKS 0x01 /* don't check watermarks at all */ -#define ALLOC_HARDER 0x02 /* try to alloc harder */ -#define ALLOC_HIGH 0x04 /* __GFP_HIGH set */ -#define ALLOC_CPUSET 0x08 /* check for correct cpuset */ +#define ALLOC_WMARK_MIN 0x02 /* use pages_min watermark */ +#define ALLOC_WMARK_LOW 0x04 /* use pages_low watermark */ +#define ALLOC_WMARK_HIGH 0x08 /* use pages_high watermark */ +#define ALLOC_HARDER 0x10 /* try to alloc harder */ +#define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ +#define ALLOC_CPUSET 0x40 /* check for correct cpuset */ /* * Return 1 if free pages are above 'mark'. This takes into account the order @@ -813,7 +833,14 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, continue; if (!(alloc_flags & ALLOC_NO_WATERMARKS)) { - if (!zone_watermark_ok(*z, order, (*z)->pages_low, + unsigned long mark; + if (alloc_flags & ALLOC_WMARK_MIN) + mark = (*z)->pages_min; + else if (alloc_flags & ALLOC_WMARK_LOW) + mark = (*z)->pages_low; + else + mark = (*z)->pages_high; + if (!zone_watermark_ok(*z, order, mark, classzone_idx, alloc_flags)) continue; } @@ -854,7 +881,7 @@ restart: } page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_CPUSET); + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -871,7 +898,7 @@ restart: * cannot run direct reclaim, or if the caller has realtime scheduling * policy. */ - alloc_flags = 0; + alloc_flags = ALLOC_WMARK_MIN; if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait) alloc_flags |= ALLOC_HARDER; if (gfp_mask & __GFP_HIGH) @@ -942,7 +969,7 @@ rebalance: * under heavy pressure. */ page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_CPUSET); + zonelist, ALLOC_WMARK_HIGH|ALLOC_CPUSET); if (page) goto got_pg; @@ -225,7 +225,7 @@ vma_address(struct page *page, struct vm_area_struct *vma) /* * At what user virtual address is page expected in vma? checking that the - * page matches the vma: currently only used by unuse_process, on anon pages. + * page matches the vma: currently only used on anon pages, by unuse_vma; */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { @@ -234,7 +234,8 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) (void *)page->mapping - PAGE_MAPPING_ANON) return -EFAULT; } else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) { - if (vma->vm_file->f_mapping != page->mapping) + if (!vma->vm_file || + vma->vm_file->f_mapping != page->mapping) return -EFAULT; } else return -EFAULT; @@ -289,7 +290,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, * repeatedly from either page_referenced_anon or page_referenced_file. */ static int page_referenced_one(struct page *page, - struct vm_area_struct *vma, unsigned int *mapcount, int ignore_token) + struct vm_area_struct *vma, unsigned int *mapcount) { struct mm_struct *mm = vma->vm_mm; unsigned long address; @@ -310,7 +311,7 @@ static int page_referenced_one(struct page *page, /* Pretend the page is referenced if the task has the swap token and is in the middle of a page fault. */ - if (mm != current->mm && !ignore_token && has_swap_token(mm) && + if (mm != current->mm && has_swap_token(mm) && rwsem_is_locked(&mm->mmap_sem)) referenced++; @@ -320,7 +321,7 @@ out: return referenced; } -static int page_referenced_anon(struct page *page, int ignore_token) +static int page_referenced_anon(struct page *page) { unsigned int mapcount; struct anon_vma *anon_vma; @@ -333,8 +334,7 @@ static int page_referenced_anon(struct page *page, int ignore_token) mapcount = page_mapcount(page); list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - referenced += page_referenced_one(page, vma, &mapcount, - ignore_token); + referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; } @@ -353,7 +353,7 @@ static int page_referenced_anon(struct page *page, int ignore_token) * * This function is only called from page_referenced for object-based pages. */ -static int page_referenced_file(struct page *page, int ignore_token) +static int page_referenced_file(struct page *page) { unsigned int mapcount; struct address_space *mapping = page->mapping; @@ -391,8 +391,7 @@ static int page_referenced_file(struct page *page, int ignore_token) referenced++; break; } - referenced += page_referenced_one(page, vma, &mapcount, - ignore_token); + referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; } @@ -409,13 +408,10 @@ static int page_referenced_file(struct page *page, int ignore_token) * Quick test_and_clear_referenced for all mappings to a page, * returns the number of ptes which referenced the page. */ -int page_referenced(struct page *page, int is_locked, int ignore_token) +int page_referenced(struct page *page, int is_locked) { int referenced = 0; - if (!swap_token_default_timeout) - ignore_token = 1; - if (page_test_and_clear_young(page)) referenced++; @@ -424,15 +420,14 @@ int page_referenced(struct page *page, int is_locked, int ignore_token) if (page_mapped(page) && page->mapping) { if (PageAnon(page)) - referenced += page_referenced_anon(page, ignore_token); + referenced += page_referenced_anon(page); else if (is_locked) - referenced += page_referenced_file(page, ignore_token); + referenced += page_referenced_file(page); else if (TestSetPageLocked(page)) referenced++; else { if (page->mapping) - referenced += page_referenced_file(page, - ignore_token); + referenced += page_referenced_file(page); unlock_page(page); } } @@ -529,10 +524,8 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma) * If the page is mlock()d, we cannot swap it out. * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. - * - * Pages belonging to VM_RESERVED regions should not happen here. */ - if ((vma->vm_flags & (VM_LOCKED|VM_RESERVED)) || + if ((vma->vm_flags & VM_LOCKED) || ptep_clear_flush_young(vma, address, pte)) { ret = SWAP_FAIL; goto out_unmap; @@ -613,7 +606,6 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; - unsigned long pfn; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -642,15 +634,8 @@ static void try_to_unmap_cluster(unsigned long cursor, for (; address < end; pte++, address += PAGE_SIZE) { if (!pte_present(*pte)) continue; - - pfn = pte_pfn(*pte); - if (unlikely(!pfn_valid(pfn))) { - print_bad_pte(vma, *pte, address); - continue; - } - - page = pfn_to_page(pfn); - BUG_ON(PageAnon(page)); + page = vm_normal_page(vma, address, *pte); + BUG_ON(!page || PageAnon(page)); if (ptep_clear_flush_young(vma, address, pte)) continue; @@ -727,7 +712,7 @@ static int try_to_unmap_file(struct page *page) list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + if (vma->vm_flags & VM_LOCKED) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -761,7 +746,7 @@ static int try_to_unmap_file(struct page *page) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if (vma->vm_flags & (VM_LOCKED|VM_RESERVED)) + if (vma->vm_flags & VM_LOCKED) continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && @@ -783,11 +768,8 @@ static int try_to_unmap_file(struct page *page) * in locked vmas). Reset cursor on all unreserved nonlinear * vmas, now forgetting on which ones it had fallen behind. */ - list_for_each_entry(vma, &mapping->i_mmap_nonlinear, - shared.vm_set.list) { - if (!(vma->vm_flags & VM_RESERVED)) - vma->vm_private_data = NULL; - } + list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) + vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); return ret; @@ -34,8 +34,6 @@ /* How many pages do we try to swap or page in/out together? */ int page_cluster; -#ifdef CONFIG_HUGETLB_PAGE - void put_page(struct page *page) { if (unlikely(PageCompound(page))) { @@ -52,7 +50,6 @@ void put_page(struct page *page) __page_cache_release(page); } EXPORT_SYMBOL(put_page); -#endif /* * Writeback is about to end against a page which has been marked for immediate diff --git a/mm/thrash.c b/mm/thrash.c index eff3c18..f4c560b 100644 --- a/mm/thrash.c +++ b/mm/thrash.c @@ -57,14 +57,17 @@ void grab_swap_token(void) /* We have the token. Let others know we still need it. */ if (has_swap_token(current->mm)) { current->mm->recent_pagein = 1; + if (unlikely(!swap_token_default_timeout)) + disable_swap_token(); return; } if (time_after(jiffies, swap_token_check)) { - /* Can't get swapout protection if we exceed our RSS limit. */ - // if (current->mm->rss > current->mm->rlimit_rss) - // return; + if (!swap_token_default_timeout) { + swap_token_check = jiffies + SWAP_TOKEN_CHECK_INTERVAL; + return; + } /* ... or if we recently held the token. */ if (time_before(jiffies, current->mm->swap_token_time)) @@ -95,6 +98,7 @@ void __put_swap_token(struct mm_struct *mm) { spin_lock(&swap_token_lock); if (likely(mm == swap_token_mm)) { + mm->swap_token_time = jiffies + SWAP_TOKEN_CHECK_INTERVAL; swap_token_mm = &init_mm; swap_token_check = jiffies; } diff --git a/mm/truncate.c b/mm/truncate.c index 29c18f6..9173ab50 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -282,8 +282,8 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Zap the rest of the file in one hit. */ unmap_mapping_range(mapping, - page_index << PAGE_CACHE_SHIFT, - (end - page_index + 1) + (loff_t)page_index<<PAGE_CACHE_SHIFT, + (loff_t)(end - page_index + 1) << PAGE_CACHE_SHIFT, 0); did_range_unmap = 1; @@ -292,7 +292,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * Just zap this page */ unmap_mapping_range(mapping, - page_index << PAGE_CACHE_SHIFT, + (loff_t)page_index<<PAGE_CACHE_SHIFT, PAGE_CACHE_SIZE, 0); } } diff --git a/mm/vmscan.c b/mm/vmscan.c index 2813054..b0cd81c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -201,13 +201,25 @@ static int shrink_slab(unsigned long scanned, gfp_t gfp_mask, list_for_each_entry(shrinker, &shrinker_list, list) { unsigned long long delta; unsigned long total_scan; + unsigned long max_pass = (*shrinker->shrinker)(0, gfp_mask); delta = (4 * scanned) / shrinker->seeks; - delta *= (*shrinker->shrinker)(0, gfp_mask); + delta *= max_pass; do_div(delta, lru_pages + 1); shrinker->nr += delta; - if (shrinker->nr < 0) - shrinker->nr = LONG_MAX; /* It wrapped! */ + if (shrinker->nr < 0) { + printk(KERN_ERR "%s: nr=%ld\n", + __FUNCTION__, shrinker->nr); + shrinker->nr = max_pass; + } + + /* + * Avoid risking looping forever due to too large nr value: + * never try to free more than twice the estimate number of + * freeable entries. + */ + if (shrinker->nr > max_pass * 2) + shrinker->nr = max_pass * 2; total_scan = shrinker->nr; shrinker->nr = 0; @@ -407,7 +419,7 @@ static int shrink_list(struct list_head *page_list, struct scan_control *sc) if (PageWriteback(page)) goto keep_locked; - referenced = page_referenced(page, 1, sc->priority <= 0); + referenced = page_referenced(page, 1); /* In active use or really unfreeable? Activate it. */ if (referenced && page_mapping_inuse(page)) goto activate_locked; @@ -756,7 +768,7 @@ refill_inactive_zone(struct zone *zone, struct scan_control *sc) if (page_mapped(page)) { if (!reclaim_mapped || (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->priority <= 0)) { + page_referenced(page, 0)) { list_add(&page->lru, &l_active); continue; } @@ -960,6 +972,8 @@ int try_to_free_pages(struct zone **zones, gfp_t gfp_mask) sc.nr_reclaimed = 0; sc.priority = priority; sc.swap_cluster_max = SWAP_CLUSTER_MAX; + if (!priority) + disable_swap_token(); shrink_caches(zones, &sc); shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); if (reclaim_state) { @@ -1056,6 +1070,10 @@ loop_again: int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ unsigned long lru_pages = 0; + /* The swap token gets in the way of swapout... */ + if (!priority) + disable_swap_token(); + all_zones_ok = 1; if (nr_pages == 0) { @@ -1360,6 +1378,7 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) sc.nr_reclaimed = 0; /* scan at the highest priority */ sc.priority = 0; + disable_swap_token(); if (nr_pages > SWAP_CLUSTER_MAX) sc.swap_cluster_max = nr_pages; |