diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 18 | ||||
-rw-r--r-- | mm/Makefile | 3 | ||||
-rw-r--r-- | mm/bootmem.c | 2 | ||||
-rw-r--r-- | mm/fadvise.c | 2 | ||||
-rw-r--r-- | mm/filemap.c | 55 | ||||
-rw-r--r-- | mm/fremap.c | 27 | ||||
-rw-r--r-- | mm/hugetlb.c | 67 | ||||
-rw-r--r-- | mm/internal.h | 131 | ||||
-rw-r--r-- | mm/memcontrol.c | 466 | ||||
-rw-r--r-- | mm/memory.c | 127 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 19 | ||||
-rw-r--r-- | mm/mempolicy.c | 11 | ||||
-rw-r--r-- | mm/migrate.c | 274 | ||||
-rw-r--r-- | mm/mlock.c | 443 | ||||
-rw-r--r-- | mm/mmap.c | 81 | ||||
-rw-r--r-- | mm/mremap.c | 8 | ||||
-rw-r--r-- | mm/nommu.c | 44 | ||||
-rw-r--r-- | mm/page-writeback.c | 22 | ||||
-rw-r--r-- | mm/page_alloc.c | 123 | ||||
-rw-r--r-- | mm/page_cgroup.c | 237 | ||||
-rw-r--r-- | mm/pdflush.c | 2 | ||||
-rw-r--r-- | mm/readahead.c | 4 | ||||
-rw-r--r-- | mm/rmap.c | 319 | ||||
-rw-r--r-- | mm/shmem.c | 12 | ||||
-rw-r--r-- | mm/swap.c | 172 | ||||
-rw-r--r-- | mm/swap_state.c | 11 | ||||
-rw-r--r-- | mm/swapfile.c | 27 | ||||
-rw-r--r-- | mm/truncate.c | 6 | ||||
-rw-r--r-- | mm/vmalloc.c | 975 | ||||
-rw-r--r-- | mm/vmscan.c | 1026 | ||||
-rw-r--r-- | mm/vmstat.c | 33 |
31 files changed, 3629 insertions, 1118 deletions
@@ -101,7 +101,7 @@ config HAVE_MEMORY_PRESENT # with gcc 3.4 and later. # config SPARSEMEM_STATIC - def_bool n + bool # # Architecture platforms which require a two level mem_section in SPARSEMEM @@ -113,7 +113,7 @@ config SPARSEMEM_EXTREME depends on SPARSEMEM && !SPARSEMEM_STATIC config SPARSEMEM_VMEMMAP_ENABLE - def_bool n + bool config SPARSEMEM_VMEMMAP bool "Sparse Memory virtual memmap" @@ -187,6 +187,9 @@ config RESOURCES_64BIT help This option allows memory and IO resources to be 64 bit. +config PHYS_ADDR_T_64BIT + def_bool 64BIT || ARCH_PHYS_ADDR_T_64BIT + config ZONE_DMA_FLAG int default "0" if !ZONE_DMA @@ -206,5 +209,16 @@ config VIRT_TO_BUS def_bool y depends on !ARCH_NO_VIRT_TO_BUS +config UNEVICTABLE_LRU + bool "Add LRU list to track non-evictable pages" + default y + depends on MMU + help + Keeps unevictable pages off of the active and inactive pageout + lists, so kswapd will not waste CPU time or have its balancing + algorithms thrown off by scanning these pages. Selecting this + will use one page flag and increase the code size a little, + say Y unless you know what you are doing. + config MMU_NOTIFIER bool diff --git a/mm/Makefile b/mm/Makefile index da4ccf0..c06b45a 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -33,5 +33,4 @@ obj-$(CONFIG_FS_XIP) += filemap_xip.o obj-$(CONFIG_MIGRATION) += migrate.o obj-$(CONFIG_SMP) += allocpercpu.o obj-$(CONFIG_QUICKLIST) += quicklist.o -obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o - +obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o diff --git a/mm/bootmem.c b/mm/bootmem.c index ad8eec6..ac5a891 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -48,7 +48,7 @@ early_param("bootmem_debug", bootmem_debug_setup); if (unlikely(bootmem_debug)) \ printk(KERN_INFO \ "bootmem::%s " fmt, \ - __FUNCTION__, ## args); \ + __func__, ## args); \ }) static unsigned long __init bootmap_bytes(unsigned long pages) diff --git a/mm/fadvise.c b/mm/fadvise.c index 343cfdf..a1da969 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -3,7 +3,7 @@ * * Copyright (C) 2002, Linus Torvalds * - * 11Jan2003 akpm@digeo.com + * 11Jan2003 Andrew Morton * Initial version. */ diff --git a/mm/filemap.c b/mm/filemap.c index 876bc59..ab85536 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -33,6 +33,7 @@ #include <linux/cpuset.h> #include <linux/hardirq.h> /* for BUG_ON(!in_atomic()) only */ #include <linux/memcontrol.h> +#include <linux/mm_inline.h> /* for page_is_file_cache() */ #include "internal.h" /* @@ -115,12 +116,12 @@ void __remove_from_page_cache(struct page *page) { struct address_space *mapping = page->mapping; - mem_cgroup_uncharge_cache_page(page); radix_tree_delete(&mapping->page_tree, page->index); page->mapping = NULL; mapping->nrpages--; __dec_zone_page_state(page, NR_FILE_PAGES); BUG_ON(page_mapped(page)); + mem_cgroup_uncharge_cache_page(page); /* * Some filesystems seem to re-dirty the page even after @@ -492,9 +493,24 @@ EXPORT_SYMBOL(add_to_page_cache_locked); int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t offset, gfp_t gfp_mask) { - int ret = add_to_page_cache(page, mapping, offset, gfp_mask); - if (ret == 0) - lru_cache_add(page); + int ret; + + /* + * Splice_read and readahead add shmem/tmpfs pages into the page cache + * before shmem_readpage has a chance to mark them as SwapBacked: they + * need to go on the active_anon lru below, and mem_cgroup_cache_charge + * (called in add_to_page_cache) needs to know where they're going too. + */ + if (mapping_cap_swap_backed(mapping)) + SetPageSwapBacked(page); + + ret = add_to_page_cache(page, mapping, offset, gfp_mask); + if (ret == 0) { + if (page_is_file_cache(page)) + lru_cache_add_file(page); + else + lru_cache_add_active_anon(page); + } return ret; } @@ -557,17 +573,14 @@ EXPORT_SYMBOL(wait_on_page_bit); * mechananism between PageLocked pages and PageWriteback pages is shared. * But that's OK - sleepers in wait_on_page_writeback() just go back to sleep. * - * The first mb is necessary to safely close the critical section opened by the - * test_and_set_bit() to lock the page; the second mb is necessary to enforce - * ordering between the clear_bit and the read of the waitqueue (to avoid SMP - * races with a parallel wait_on_page_locked()). + * The mb is necessary to enforce ordering between the clear_bit and the read + * of the waitqueue (to avoid SMP races with a parallel wait_on_page_locked()). */ void unlock_page(struct page *page) { - smp_mb__before_clear_bit(); - if (!test_and_clear_bit(PG_locked, &page->flags)) - BUG(); - smp_mb__after_clear_bit(); + VM_BUG_ON(!PageLocked(page)); + clear_bit_unlock(PG_locked, &page->flags); + smp_mb__after_clear_bit(); wake_up_page(page, PG_locked); } EXPORT_SYMBOL(unlock_page); @@ -1100,8 +1113,9 @@ page_ok: page_not_up_to_date: /* Get exclusive access to the page ... */ - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; page_not_up_to_date_locked: /* Did it get truncated before we got the lock? */ @@ -1130,8 +1144,9 @@ readpage: } if (!PageUptodate(page)) { - if (lock_page_killable(page)) - goto readpage_eio; + error = lock_page_killable(page); + if (unlikely(error)) + goto readpage_error; if (!PageUptodate(page)) { if (page->mapping == NULL) { /* @@ -1143,15 +1158,14 @@ readpage: } unlock_page(page); shrink_readahead_size_eio(filp, ra); - goto readpage_eio; + error = -EIO; + goto readpage_error; } unlock_page(page); } goto page_ok; -readpage_eio: - error = -EIO; readpage_error: /* UHHUH! A synchronous read error occurred. Report it */ desc->error = error; @@ -1186,8 +1200,7 @@ out: ra->prev_pos |= prev_offset; *ppos = ((loff_t)index << PAGE_CACHE_SHIFT) + offset; - if (filp) - file_accessed(filp); + file_accessed(filp); } int file_read_actor(read_descriptor_t *desc, struct page *page, diff --git a/mm/fremap.c b/mm/fremap.c index 7881638..7d12ca7 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -21,6 +21,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + static void zap_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pte_t *ptep) { @@ -215,15 +217,31 @@ asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, spin_unlock(&mapping->i_mmap_lock); } + if (vma->vm_flags & VM_LOCKED) { + /* + * drop PG_Mlocked flag for over-mapped range + */ + unsigned int saved_flags = vma->vm_flags; + munlock_vma_pages_range(vma, start, start + size); + vma->vm_flags = saved_flags; + } + mmu_notifier_invalidate_range_start(mm, start, start + size); err = populate_range(mm, vma, start, size, pgoff); mmu_notifier_invalidate_range_end(mm, start, start + size); if (!err && !(flags & MAP_NONBLOCK)) { - if (unlikely(has_write_lock)) { - downgrade_write(&mm->mmap_sem); - has_write_lock = 0; + if (vma->vm_flags & VM_LOCKED) { + /* + * might be mapping previously unmapped range of file + */ + mlock_vma_pages_range(vma, start, start + size); + } else { + if (unlikely(has_write_lock)) { + downgrade_write(&mm->mmap_sem); + has_write_lock = 0; + } + make_pages_present(start, start+size); } - make_pages_present(start, start+size); } /* @@ -240,4 +258,3 @@ out: return err; } - diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 67a7119..ce8cbb2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -262,7 +262,7 @@ struct resv_map { struct list_head regions; }; -struct resv_map *resv_map_alloc(void) +static struct resv_map *resv_map_alloc(void) { struct resv_map *resv_map = kmalloc(sizeof(*resv_map), GFP_KERNEL); if (!resv_map) @@ -274,7 +274,7 @@ struct resv_map *resv_map_alloc(void) return resv_map; } -void resv_map_release(struct kref *ref) +static void resv_map_release(struct kref *ref) { struct resv_map *resv_map = container_of(ref, struct resv_map, refs); @@ -289,7 +289,7 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma) if (!(vma->vm_flags & VM_SHARED)) return (struct resv_map *)(get_vma_private_data(vma) & ~HPAGE_RESV_MASK); - return 0; + return NULL; } static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map) @@ -1459,11 +1459,11 @@ int hugetlb_report_meminfo(char *buf) { struct hstate *h = &default_hstate; return sprintf(buf, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %5lu kB\n", + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", h->nr_huge_pages, h->free_huge_pages, h->resv_huge_pages, @@ -1747,10 +1747,8 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, * from other VMAs and let the children be SIGKILLed if they are faulting the * same region. */ -int unmap_ref_private(struct mm_struct *mm, - struct vm_area_struct *vma, - struct page *page, - unsigned long address) +static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma, + struct page *page, unsigned long address) { struct vm_area_struct *iter_vma; struct address_space *mapping; @@ -2008,7 +2006,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); if (huge_pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - goto out_unlock; + goto out_mutex; } ret = 0; @@ -2024,7 +2022,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (write_access && !pte_write(entry)) { if (vma_needs_reservation(h, vma, address) < 0) { ret = VM_FAULT_OOM; - goto out_unlock; + goto out_mutex; } if (!(vma->vm_flags & VM_SHARED)) @@ -2034,10 +2032,23 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, spin_lock(&mm->page_table_lock); /* Check for a racing update before calling hugetlb_cow */ - if (likely(pte_same(entry, huge_ptep_get(ptep)))) - if (write_access && !pte_write(entry)) + if (unlikely(!pte_same(entry, huge_ptep_get(ptep)))) + goto out_page_table_lock; + + + if (write_access) { + if (!pte_write(entry)) { ret = hugetlb_cow(mm, vma, address, ptep, entry, pagecache_page); + goto out_page_table_lock; + } + entry = pte_mkdirty(entry); + } + entry = pte_mkyoung(entry); + if (huge_ptep_set_access_flags(vma, address, ptep, entry, write_access)) + update_mmu_cache(vma, address, entry); + +out_page_table_lock: spin_unlock(&mm->page_table_lock); if (pagecache_page) { @@ -2045,7 +2056,7 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } -out_unlock: +out_mutex: mutex_unlock(&hugetlb_instantiation_mutex); return ret; @@ -2060,6 +2071,14 @@ follow_huge_pud(struct mm_struct *mm, unsigned long address, return NULL; } +static int huge_zeropage_ok(pte_t *ptep, int write, int shared) +{ + if (!ptep || write || shared) + return 0; + else + return huge_pte_none(huge_ptep_get(ptep)); +} + int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page **pages, struct vm_area_struct **vmas, unsigned long *position, int *length, int i, @@ -2069,6 +2088,8 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long vaddr = *position; int remainder = *length; struct hstate *h = hstate_vma(vma); + int zeropage_ok = 0; + int shared = vma->vm_flags & VM_SHARED; spin_lock(&mm->page_table_lock); while (vaddr < vma->vm_end && remainder) { @@ -2081,8 +2102,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, * first, for the page indexing below to work. */ pte = huge_pte_offset(mm, vaddr & huge_page_mask(h)); + if (huge_zeropage_ok(pte, write, shared)) + zeropage_ok = 1; - if (!pte || huge_pte_none(huge_ptep_get(pte)) || + if (!pte || + (huge_pte_none(huge_ptep_get(pte)) && !zeropage_ok) || (write && !pte_write(huge_ptep_get(pte)))) { int ret; @@ -2102,8 +2126,11 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, page = pte_page(huge_ptep_get(pte)); same_page: if (pages) { - get_page(page); - pages[i] = page + pfn_offset; + if (zeropage_ok) + pages[i] = ZERO_PAGE(0); + else + pages[i] = page + pfn_offset; + get_page(pages[i]); } if (vmas) diff --git a/mm/internal.h b/mm/internal.h index 1f43f74..e4e728b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -39,6 +39,15 @@ static inline void __put_page(struct page *page) atomic_dec(&page->_count); } +/* + * in mm/vmscan.c: + */ +extern int isolate_lru_page(struct page *page); +extern void putback_lru_page(struct page *page); + +/* + * in mm/page_alloc.c + */ extern void __free_pages_bootmem(struct page *page, unsigned int order); /* @@ -52,6 +61,120 @@ static inline unsigned long page_order(struct page *page) return page_private(page); } +extern long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +extern void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end); +static inline void munlock_vma_pages_all(struct vm_area_struct *vma) +{ + munlock_vma_pages_range(vma, vma->vm_start, vma->vm_end); +} + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * unevictable_migrate_page() called only from migrate_page_copy() to + * migrate unevictable flag to new page. + * Note that the old page has been isolated from the LRU lists at this + * point so we don't need to worry about LRU statistics. + */ +static inline void unevictable_migrate_page(struct page *new, struct page *old) +{ + if (TestClearPageUnevictable(old)) + SetPageUnevictable(new); +} +#else +static inline void unevictable_migrate_page(struct page *new, struct page *old) +{ +} +#endif + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Called only in fault path via page_evictable() for a new page + * to determine if it's being mapped into a LOCKED vma. + * If so, mark page as mlocked. + */ +static inline int is_mlocked_vma(struct vm_area_struct *vma, struct page *page) +{ + VM_BUG_ON(PageLRU(page)); + + if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED)) + return 0; + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + } + return 1; +} + +/* + * must be called with vma's mmap_sem held for read, and page locked. + */ +extern void mlock_vma_page(struct page *page); + +/* + * Clear the page's PageMlocked(). This can be useful in a situation where + * we want to unconditionally remove a page from the pagecache -- e.g., + * on truncation or freeing. + * + * It is legal to call this function for any page, mlocked or not. + * If called for a page that is still mapped by mlocked vmas, all we do + * is revert to lazy LRU behaviour -- semantics are not broken. + */ +extern void __clear_page_mlock(struct page *page); +static inline void clear_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) + __clear_page_mlock(page); +} + +/* + * mlock_migrate_page - called only from migrate_page_copy() to + * migrate the Mlocked page flag; update statistics. + */ +static inline void mlock_migrate_page(struct page *newpage, struct page *page) +{ + if (TestClearPageMlocked(page)) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); + SetPageMlocked(newpage); + __inc_zone_page_state(newpage, NR_MLOCK); + local_irq_restore(flags); + } +} + +/* + * free_page_mlock() -- clean up attempts to free and mlocked() page. + * Page should not be on lru, so no need to fix that up. + * free_pages_check() will verify... + */ +static inline void free_page_mlock(struct page *page) +{ + if (unlikely(TestClearPageMlocked(page))) { + unsigned long flags; + + local_irq_save(flags); + __dec_zone_page_state(page, NR_MLOCK); + __count_vm_event(UNEVICTABLE_MLOCKFREED); + local_irq_restore(flags); + } +} + +#else /* CONFIG_UNEVICTABLE_LRU */ +static inline int is_mlocked_vma(struct vm_area_struct *v, struct page *p) +{ + return 0; +} +static inline void clear_page_mlock(struct page *page) { } +static inline void mlock_vma_page(struct page *page) { } +static inline void mlock_migrate_page(struct page *new, struct page *old) { } +static inline void free_page_mlock(struct page *page) { } + +#endif /* CONFIG_UNEVICTABLE_LRU */ + /* * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, * so all functions starting at paging_init should be marked __init @@ -120,4 +243,12 @@ static inline void mminit_validate_memmodel_limits(unsigned long *start_pfn, } #endif /* CONFIG_SPARSEMEM */ +#define GUP_FLAGS_WRITE 0x1 +#define GUP_FLAGS_FORCE 0x2 +#define GUP_FLAGS_IGNORE_VMA_PERMISSIONS 0x4 + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas); + #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 36896f3..d4a92b6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -32,11 +32,12 @@ #include <linux/fs.h> #include <linux/seq_file.h> #include <linux/vmalloc.h> +#include <linux/mm_inline.h> +#include <linux/page_cgroup.h> #include <asm/uaccess.h> struct cgroup_subsys mem_cgroup_subsys __read_mostly; -static struct kmem_cache *page_cgroup_cache __read_mostly; #define MEM_CGROUP_RECLAIM_RETRIES 5 /* @@ -65,11 +66,10 @@ struct mem_cgroup_stat { /* * For accounting under irq disable, no need for increment preempt count. */ -static void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat *stat, +static inline void __mem_cgroup_stat_add_safe(struct mem_cgroup_stat_cpu *stat, enum mem_cgroup_stat_index idx, int val) { - int cpu = smp_processor_id(); - stat->cpustat[cpu].count[idx] += val; + stat->count[idx] += val; } static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, @@ -85,22 +85,13 @@ static s64 mem_cgroup_read_stat(struct mem_cgroup_stat *stat, /* * per-zone information in memory controller. */ - -enum mem_cgroup_zstat_index { - MEM_CGROUP_ZSTAT_ACTIVE, - MEM_CGROUP_ZSTAT_INACTIVE, - - NR_MEM_CGROUP_ZSTAT, -}; - struct mem_cgroup_per_zone { /* * spin_lock to protect the per cgroup LRU */ spinlock_t lru_lock; - struct list_head active_list; - struct list_head inactive_list; - unsigned long count[NR_MEM_CGROUP_ZSTAT]; + struct list_head lists[NR_LRU_LISTS]; + unsigned long count[NR_LRU_LISTS]; }; /* Macro for accessing counter */ #define MEM_CGROUP_ZSTAT(mz, idx) ((mz)->count[(idx)]) @@ -144,69 +135,52 @@ struct mem_cgroup { }; static struct mem_cgroup init_mem_cgroup; -/* - * We use the lower bit of the page->page_cgroup pointer as a bit spin - * lock. We need to ensure that page->page_cgroup is at least two - * byte aligned (based on comments from Nick Piggin). But since - * bit_spin_lock doesn't actually set that lock bit in a non-debug - * uniprocessor kernel, we should avoid setting it here too. - */ -#define PAGE_CGROUP_LOCK_BIT 0x0 -#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) -#define PAGE_CGROUP_LOCK (1 << PAGE_CGROUP_LOCK_BIT) -#else -#define PAGE_CGROUP_LOCK 0x0 -#endif - -/* - * A page_cgroup page is associated with every page descriptor. The - * page_cgroup helps us identify information about the cgroup - */ -struct page_cgroup { - struct list_head lru; /* per cgroup LRU list */ - struct page *page; - struct mem_cgroup *mem_cgroup; - int flags; -}; -#define PAGE_CGROUP_FLAG_CACHE (0x1) /* charged as cache */ -#define PAGE_CGROUP_FLAG_ACTIVE (0x2) /* page is active in this cgroup */ - -static int page_cgroup_nid(struct page_cgroup *pc) -{ - return page_to_nid(pc->page); -} - -static enum zone_type page_cgroup_zid(struct page_cgroup *pc) -{ - return page_zonenum(pc->page); -} - enum charge_type { MEM_CGROUP_CHARGE_TYPE_CACHE = 0, MEM_CGROUP_CHARGE_TYPE_MAPPED, + MEM_CGROUP_CHARGE_TYPE_SHMEM, /* used by page migration of shmem */ MEM_CGROUP_CHARGE_TYPE_FORCE, /* used by force_empty */ + NR_CHARGE_TYPE, +}; + +/* only for here (for easy reading.) */ +#define PCGF_CACHE (1UL << PCG_CACHE) +#define PCGF_USED (1UL << PCG_USED) +#define PCGF_ACTIVE (1UL << PCG_ACTIVE) +#define PCGF_LOCK (1UL << PCG_LOCK) +#define PCGF_FILE (1UL << PCG_FILE) +static const unsigned long +pcg_default_flags[NR_CHARGE_TYPE] = { + PCGF_CACHE | PCGF_FILE | PCGF_USED | PCGF_LOCK, /* File Cache */ + PCGF_ACTIVE | PCGF_USED | PCGF_LOCK, /* Anon */ + PCGF_ACTIVE | PCGF_CACHE | PCGF_USED | PCGF_LOCK, /* Shmem */ + 0, /* FORCE */ }; /* * Always modified under lru lock. Then, not necessary to preempt_disable() */ -static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, int flags, - bool charge) +static void mem_cgroup_charge_statistics(struct mem_cgroup *mem, + struct page_cgroup *pc, + bool charge) { int val = (charge)? 1 : -1; struct mem_cgroup_stat *stat = &mem->stat; + struct mem_cgroup_stat_cpu *cpustat; VM_BUG_ON(!irqs_disabled()); - if (flags & PAGE_CGROUP_FLAG_CACHE) - __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_CACHE, val); + + cpustat = &stat->cpustat[smp_processor_id()]; + if (PageCgroupCache(pc)) + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_CACHE, val); else - __mem_cgroup_stat_add_safe(stat, MEM_CGROUP_STAT_RSS, val); + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_RSS, val); if (charge) - __mem_cgroup_stat_add_safe(stat, + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGIN_COUNT, 1); else - __mem_cgroup_stat_add_safe(stat, + __mem_cgroup_stat_add_safe(cpustat, MEM_CGROUP_STAT_PGPGOUT_COUNT, 1); } @@ -227,7 +201,7 @@ page_cgroup_zoneinfo(struct page_cgroup *pc) } static unsigned long mem_cgroup_get_all_zonestat(struct mem_cgroup *mem, - enum mem_cgroup_zstat_index idx) + enum lru_list idx) { int nid, zid; struct mem_cgroup_per_zone *mz; @@ -262,85 +236,77 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) struct mem_cgroup, css); } -static inline int page_cgroup_locked(struct page *page) -{ - return bit_spin_is_locked(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static void page_assign_page_cgroup(struct page *page, struct page_cgroup *pc) -{ - VM_BUG_ON(!page_cgroup_locked(page)); - page->page_cgroup = ((unsigned long)pc | PAGE_CGROUP_LOCK); -} - -struct page_cgroup *page_get_page_cgroup(struct page *page) -{ - return (struct page_cgroup *) (page->page_cgroup & ~PAGE_CGROUP_LOCK); -} - -static void lock_page_cgroup(struct page *page) -{ - bit_spin_lock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static int try_lock_page_cgroup(struct page *page) -{ - return bit_spin_trylock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - -static void unlock_page_cgroup(struct page *page) -{ - bit_spin_unlock(PAGE_CGROUP_LOCK_BIT, &page->page_cgroup); -} - static void __mem_cgroup_remove_list(struct mem_cgroup_per_zone *mz, struct page_cgroup *pc) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; + int lru = LRU_BASE; + + if (PageCgroupUnevictable(pc)) + lru = LRU_UNEVICTABLE; + else { + if (PageCgroupActive(pc)) + lru += LRU_ACTIVE; + if (PageCgroupFile(pc)) + lru += LRU_FILE; + } - if (from) - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; - else - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; + MEM_CGROUP_ZSTAT(mz, lru) -= 1; - mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, false); + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, false); list_del(&pc->lru); } static void __mem_cgroup_add_list(struct mem_cgroup_per_zone *mz, struct page_cgroup *pc) { - int to = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; - - if (!to) { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; - list_add(&pc->lru, &mz->inactive_list); - } else { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; - list_add(&pc->lru, &mz->active_list); + int lru = LRU_BASE; + + if (PageCgroupUnevictable(pc)) + lru = LRU_UNEVICTABLE; + else { + if (PageCgroupActive(pc)) + lru += LRU_ACTIVE; + if (PageCgroupFile(pc)) + lru += LRU_FILE; } - mem_cgroup_charge_statistics(pc->mem_cgroup, pc->flags, true); + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_add(&pc->lru, &mz->lists[lru]); + + mem_cgroup_charge_statistics(pc->mem_cgroup, pc, true); } -static void __mem_cgroup_move_lists(struct page_cgroup *pc, bool active) +static void __mem_cgroup_move_lists(struct page_cgroup *pc, enum lru_list lru) { - int from = pc->flags & PAGE_CGROUP_FLAG_ACTIVE; struct mem_cgroup_per_zone *mz = page_cgroup_zoneinfo(pc); + int active = PageCgroupActive(pc); + int file = PageCgroupFile(pc); + int unevictable = PageCgroupUnevictable(pc); + enum lru_list from = unevictable ? LRU_UNEVICTABLE : + (LRU_FILE * !!file + !!active); - if (from) - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) -= 1; - else - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) -= 1; + if (lru == from) + return; - if (active) { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE) += 1; - pc->flags |= PAGE_CGROUP_FLAG_ACTIVE; - list_move(&pc->lru, &mz->active_list); + MEM_CGROUP_ZSTAT(mz, from) -= 1; + /* + * However this is done under mz->lru_lock, another flags, which + * are not related to LRU, will be modified from out-of-lock. + * We have to use atomic set/clear flags. + */ + if (is_unevictable_lru(lru)) { + ClearPageCgroupActive(pc); + SetPageCgroupUnevictable(pc); } else { - MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE) += 1; - pc->flags &= ~PAGE_CGROUP_FLAG_ACTIVE; - list_move(&pc->lru, &mz->inactive_list); + if (is_active_lru(lru)) + SetPageCgroupActive(pc); + else + ClearPageCgroupActive(pc); + ClearPageCgroupUnevictable(pc); } + + MEM_CGROUP_ZSTAT(mz, lru) += 1; + list_move(&pc->lru, &mz->lists[lru]); } int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) @@ -356,7 +322,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) /* * This routine assumes that the appropriate zone's lru lock is already held */ -void mem_cgroup_move_lists(struct page *page, bool active) +void mem_cgroup_move_lists(struct page *page, enum lru_list lru) { struct page_cgroup *pc; struct mem_cgroup_per_zone *mz; @@ -372,17 +338,16 @@ void mem_cgroup_move_lists(struct page *page, bool active) * safely get to page_cgroup without it, so just try_lock it: * mem_cgroup_isolate_pages allows for page left on wrong list. */ - if (!try_lock_page_cgroup(page)) + pc = lookup_page_cgroup(page); + if (!trylock_page_cgroup(pc)) return; - - pc = page_get_page_cgroup(page); - if (pc) { + if (pc && PageCgroupUsed(pc)) { mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); - __mem_cgroup_move_lists(pc, active); + __mem_cgroup_move_lists(pc, lru); spin_unlock_irqrestore(&mz->lru_lock, flags); } - unlock_page_cgroup(page); + unlock_page_cgroup(pc); } /* @@ -403,21 +368,6 @@ int mem_cgroup_calc_mapped_ratio(struct mem_cgroup *mem) } /* - * This function is called from vmscan.c. In page reclaiming loop. balance - * between active and inactive list is calculated. For memory controller - * page reclaiming, we should use using mem_cgroup's imbalance rather than - * zone's global lru imbalance. - */ -long mem_cgroup_reclaim_imbalance(struct mem_cgroup *mem) -{ - unsigned long active, inactive; - /* active and inactive are the number of pages. 'long' is ok.*/ - active = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_ACTIVE); - inactive = mem_cgroup_get_all_zonestat(mem, MEM_CGROUP_ZSTAT_INACTIVE); - return (long) (active / (inactive + 1)); -} - -/* * prev_priority control...this will be used in memory reclaim path. */ int mem_cgroup_get_reclaim_priority(struct mem_cgroup *mem) @@ -444,28 +394,17 @@ void mem_cgroup_record_reclaim_priority(struct mem_cgroup *mem, int priority) * (see include/linux/mmzone.h) */ -long mem_cgroup_calc_reclaim_active(struct mem_cgroup *mem, - struct zone *zone, int priority) +long mem_cgroup_calc_reclaim(struct mem_cgroup *mem, struct zone *zone, + int priority, enum lru_list lru) { - long nr_active; + long nr_pages; int nid = zone->zone_pgdat->node_id; int zid = zone_idx(zone); struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); - nr_active = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_ACTIVE); - return (nr_active >> priority); -} + nr_pages = MEM_CGROUP_ZSTAT(mz, lru); -long mem_cgroup_calc_reclaim_inactive(struct mem_cgroup *mem, - struct zone *zone, int priority) -{ - long nr_inactive; - int nid = zone->zone_pgdat->node_id; - int zid = zone_idx(zone); - struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(mem, nid, zid); - - nr_inactive = MEM_CGROUP_ZSTAT(mz, MEM_CGROUP_ZSTAT_INACTIVE); - return (nr_inactive >> priority); + return (nr_pages >> priority); } unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, @@ -473,7 +412,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { unsigned long nr_taken = 0; struct page *page; @@ -484,38 +423,38 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, int nid = z->zone_pgdat->node_id; int zid = zone_idx(z); struct mem_cgroup_per_zone *mz; + int lru = LRU_FILE * !!file + !!active; BUG_ON(!mem_cont); mz = mem_cgroup_zoneinfo(mem_cont, nid, zid); - if (active) - src = &mz->active_list; - else - src = &mz->inactive_list; - + src = &mz->lists[lru]; spin_lock(&mz->lru_lock); scan = 0; list_for_each_entry_safe_reverse(pc, tmp, src, lru) { if (scan >= nr_to_scan) break; + if (unlikely(!PageCgroupUsed(pc))) + continue; page = pc->page; if (unlikely(!PageLRU(page))) continue; - if (PageActive(page) && !active) { - __mem_cgroup_move_lists(pc, true); - continue; - } - if (!PageActive(page) && active) { - __mem_cgroup_move_lists(pc, false); + /* + * TODO: play better with lumpy reclaim, grabbing anything. + */ + if (PageUnevictable(page) || + (PageActive(page) && !active) || + (!PageActive(page) && active)) { + __mem_cgroup_move_lists(pc, page_lru(page)); continue; } scan++; list_move(&pc->lru, &pc_list); - if (__isolate_lru_page(page, mode) == 0) { + if (__isolate_lru_page(page, mode, file) == 0) { list_move(&page->lru, dst); nr_taken++; } @@ -540,26 +479,27 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, { struct mem_cgroup *mem; struct page_cgroup *pc; - unsigned long flags; unsigned long nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup_per_zone *mz; + unsigned long flags; - pc = kmem_cache_alloc(page_cgroup_cache, gfp_mask); - if (unlikely(pc == NULL)) - goto err; - + pc = lookup_page_cgroup(page); + /* can happen at boot */ + if (unlikely(!pc)) + return 0; + prefetchw(pc); /* * We always charge the cgroup the mm_struct belongs to. * The mm_struct's mem_cgroup changes on task migration if the * thread group leader migrates. It's possible that mm is not * set, if so charge the init_mm (happens for pagecache usage). */ + if (likely(!memcg)) { rcu_read_lock(); mem = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (unlikely(!mem)) { rcu_read_unlock(); - kmem_cache_free(page_cgroup_cache, pc); return 0; } /* @@ -572,7 +512,7 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, css_get(&memcg->css); } - while (res_counter_charge(&mem->res, PAGE_SIZE)) { + while (unlikely(res_counter_charge(&mem->res, PAGE_SIZE))) { if (!(gfp_mask & __GFP_WAIT)) goto out; @@ -595,39 +535,33 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, } } - pc->mem_cgroup = mem; - pc->page = page; - /* - * If a page is accounted as a page cache, insert to inactive list. - * If anon, insert to active list. - */ - if (ctype == MEM_CGROUP_CHARGE_TYPE_CACHE) - pc->flags = PAGE_CGROUP_FLAG_CACHE; - else - pc->flags = PAGE_CGROUP_FLAG_ACTIVE; - lock_page_cgroup(page); - if (unlikely(page_get_page_cgroup(page))) { - unlock_page_cgroup(page); + lock_page_cgroup(pc); + if (unlikely(PageCgroupUsed(pc))) { + unlock_page_cgroup(pc); res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - kmem_cache_free(page_cgroup_cache, pc); + goto done; } - page_assign_page_cgroup(page, pc); + pc->mem_cgroup = mem; + /* + * If a page is accounted as a page cache, insert to inactive list. + * If anon, insert to active list. + */ + pc->flags = pcg_default_flags[ctype]; mz = page_cgroup_zoneinfo(pc); + spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_add_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(pc); - unlock_page_cgroup(page); done: return 0; out: css_put(&mem->css); - kmem_cache_free(page_cgroup_cache, pc); -err: return -ENOMEM; } @@ -635,7 +569,8 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) { if (mem_cgroup_subsys.disabled) return 0; - + if (PageCompound(page)) + return 0; /* * If already mapped, we don't have to account. * If page cache, page->mapping has address_space. @@ -656,7 +591,8 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, { if (mem_cgroup_subsys.disabled) return 0; - + if (PageCompound(page)) + return 0; /* * Corner case handling. This is called from add_to_page_cache() * in usual. But some FS (shmem) precharges this page before calling it @@ -669,22 +605,27 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (!(gfp_mask & __GFP_WAIT)) { struct page_cgroup *pc; - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (pc) { - VM_BUG_ON(pc->page != page); - VM_BUG_ON(!pc->mem_cgroup); - unlock_page_cgroup(page); + + pc = lookup_page_cgroup(page); + if (!pc) + return 0; + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { + unlock_page_cgroup(pc); return 0; } - unlock_page_cgroup(page); + unlock_page_cgroup(pc); } if (unlikely(!mm)) mm = &init_mm; - return mem_cgroup_charge_common(page, mm, gfp_mask, + if (page_is_file_cache(page)) + return mem_cgroup_charge_common(page, mm, gfp_mask, MEM_CGROUP_CHARGE_TYPE_CACHE, NULL); + else + return mem_cgroup_charge_common(page, mm, gfp_mask, + MEM_CGROUP_CHARGE_TYPE_SHMEM, NULL); } /* @@ -704,44 +645,46 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype) /* * Check if our page_cgroup is valid */ - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (unlikely(!pc)) - goto unlock; - - VM_BUG_ON(pc->page != page); + pc = lookup_page_cgroup(page); + if (unlikely(!pc || !PageCgroupUsed(pc))) + return; - if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED) - && ((pc->flags & PAGE_CGROUP_FLAG_CACHE) - || page_mapped(page))) - goto unlock; + lock_page_cgroup(pc); + if ((ctype == MEM_CGROUP_CHARGE_TYPE_MAPPED && page_mapped(page)) + || !PageCgroupUsed(pc)) { + /* This happens at race in zap_pte_range() and do_swap_page()*/ + unlock_page_cgroup(pc); + return; + } + ClearPageCgroupUsed(pc); + mem = pc->mem_cgroup; mz = page_cgroup_zoneinfo(pc); spin_lock_irqsave(&mz->lru_lock, flags); __mem_cgroup_remove_list(mz, pc); spin_unlock_irqrestore(&mz->lru_lock, flags); + unlock_page_cgroup(pc); - page_assign_page_cgroup(page, NULL); - unlock_page_cgroup(page); - - mem = pc->mem_cgroup; res_counter_uncharge(&mem->res, PAGE_SIZE); css_put(&mem->css); - kmem_cache_free(page_cgroup_cache, pc); return; -unlock: - unlock_page_cgroup(page); } void mem_cgroup_uncharge_page(struct page *page) { + /* early check. */ + if (page_mapped(page)) + return; + if (page->mapping && !PageAnon(page)) + return; __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_MAPPED); } void mem_cgroup_uncharge_cache_page(struct page *page) { VM_BUG_ON(page_mapped(page)); + VM_BUG_ON(page->mapping); __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE); } @@ -758,15 +701,19 @@ int mem_cgroup_prepare_migration(struct page *page, struct page *newpage) if (mem_cgroup_subsys.disabled) return 0; - lock_page_cgroup(page); - pc = page_get_page_cgroup(page); - if (pc) { + pc = lookup_page_cgroup(page); + lock_page_cgroup(pc); + if (PageCgroupUsed(pc)) { mem = pc->mem_cgroup; css_get(&mem->css); - if (pc->flags & PAGE_CGROUP_FLAG_CACHE) - ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + if (PageCgroupCache(pc)) { + if (page_is_file_cache(page)) + ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; + else + ctype = MEM_CGROUP_CHARGE_TYPE_SHMEM; + } } - unlock_page_cgroup(page); + unlock_page_cgroup(pc); if (mem) { ret = mem_cgroup_charge_common(newpage, NULL, GFP_KERNEL, ctype, mem); @@ -791,7 +738,7 @@ void mem_cgroup_end_migration(struct page *newpage) */ if (!newpage->mapping) __mem_cgroup_uncharge_common(newpage, - MEM_CGROUP_CHARGE_TYPE_FORCE); + MEM_CGROUP_CHARGE_TYPE_FORCE); else if (PageAnon(newpage)) mem_cgroup_uncharge_page(newpage); } @@ -863,7 +810,7 @@ int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long long val) #define FORCE_UNCHARGE_BATCH (128) static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, struct mem_cgroup_per_zone *mz, - int active) + enum lru_list lru) { struct page_cgroup *pc; struct page *page; @@ -871,15 +818,14 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, unsigned long flags; struct list_head *list; - if (active) - list = &mz->active_list; - else - list = &mz->inactive_list; + list = &mz->lists[lru]; spin_lock_irqsave(&mz->lru_lock, flags); while (!list_empty(list)) { pc = list_entry(list->prev, struct page_cgroup, lru); page = pc->page; + if (!PageCgroupUsed(pc)) + break; get_page(page); spin_unlock_irqrestore(&mz->lru_lock, flags); /* @@ -894,8 +840,10 @@ static void mem_cgroup_force_empty_list(struct mem_cgroup *mem, count = FORCE_UNCHARGE_BATCH; cond_resched(); } - } else - cond_resched(); + } else { + spin_lock_irqsave(&mz->lru_lock, flags); + break; + } spin_lock_irqsave(&mz->lru_lock, flags); } spin_unlock_irqrestore(&mz->lru_lock, flags); @@ -919,15 +867,17 @@ static int mem_cgroup_force_empty(struct mem_cgroup *mem) while (mem->res.usage > 0) { if (atomic_read(&mem->css.cgroup->count) > 0) goto out; + /* This is for making all *used* pages to be on LRU. */ + lru_add_drain_all(); for_each_node_state(node, N_POSSIBLE) for (zid = 0; zid < MAX_NR_ZONES; zid++) { struct mem_cgroup_per_zone *mz; + enum lru_list l; mz = mem_cgroup_zoneinfo(mem, node, zid); - /* drop all page_cgroup in active_list */ - mem_cgroup_force_empty_list(mem, mz, 1); - /* drop all page_cgroup in inactive_list */ - mem_cgroup_force_empty_list(mem, mz, 0); + for_each_lru(l) + mem_cgroup_force_empty_list(mem, mz, l); } + cond_resched(); } ret = 0; out: @@ -1012,14 +962,27 @@ static int mem_control_stat_show(struct cgroup *cont, struct cftype *cft, } /* showing # of active pages */ { - unsigned long active, inactive; - - inactive = mem_cgroup_get_all_zonestat(mem_cont, - MEM_CGROUP_ZSTAT_INACTIVE); - active = mem_cgroup_get_all_zonestat(mem_cont, - MEM_CGROUP_ZSTAT_ACTIVE); - cb->fill(cb, "active", (active) * PAGE_SIZE); - cb->fill(cb, "inactive", (inactive) * PAGE_SIZE); + unsigned long active_anon, inactive_anon; + unsigned long active_file, inactive_file; + unsigned long unevictable; + + inactive_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_ANON); + active_anon = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_ANON); + inactive_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_INACTIVE_FILE); + active_file = mem_cgroup_get_all_zonestat(mem_cont, + LRU_ACTIVE_FILE); + unevictable = mem_cgroup_get_all_zonestat(mem_cont, + LRU_UNEVICTABLE); + + cb->fill(cb, "active_anon", (active_anon) * PAGE_SIZE); + cb->fill(cb, "inactive_anon", (inactive_anon) * PAGE_SIZE); + cb->fill(cb, "active_file", (active_file) * PAGE_SIZE); + cb->fill(cb, "inactive_file", (inactive_file) * PAGE_SIZE); + cb->fill(cb, "unevictable", unevictable * PAGE_SIZE); + } return 0; } @@ -1062,6 +1025,7 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) { struct mem_cgroup_per_node *pn; struct mem_cgroup_per_zone *mz; + enum lru_list l; int zone, tmp = node; /* * This routine is called against possible nodes. @@ -1082,9 +1046,9 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *mem, int node) for (zone = 0; zone < MAX_NR_ZONES; zone++) { mz = &pn->zoneinfo[zone]; - INIT_LIST_HEAD(&mz->active_list); - INIT_LIST_HEAD(&mz->inactive_list); spin_lock_init(&mz->lru_lock); + for_each_lru(l) + INIT_LIST_HEAD(&mz->lists[l]); } return 0; } @@ -1124,8 +1088,8 @@ mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) int node; if (unlikely((cont->parent) == NULL)) { + page_cgroup_init(); mem = &init_mem_cgroup; - page_cgroup_cache = KMEM_CACHE(page_cgroup, SLAB_PANIC); } else { mem = mem_cgroup_alloc(); if (!mem) diff --git a/mm/memory.c b/mm/memory.c index 1002f47..3a6c4a6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -64,6 +64,8 @@ #include "internal.h" +#include "internal.h" + #ifndef CONFIG_NEED_MULTIPLE_NODES /* use the per-pgdat data instead for discontigmem - mbligh */ unsigned long max_mapnr; @@ -1129,12 +1131,17 @@ static inline int use_zero_page(struct vm_area_struct *vma) return !vma->vm_ops || !vma->vm_ops->fault; } -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, + + +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, struct page **pages, struct vm_area_struct **vmas) { int i; - unsigned int vm_flags; + unsigned int vm_flags = 0; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); if (len <= 0) return 0; @@ -1158,7 +1165,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, pud_t *pud; pmd_t *pmd; pte_t *pte; - if (write) /* user gate pages are read-only */ + + /* user gate pages are read-only */ + if (!ignore && write) return i ? : -EFAULT; if (pg > TASK_SIZE) pgd = pgd_offset_k(pg); @@ -1190,8 +1199,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, continue; } - if (!vma || (vma->vm_flags & (VM_IO | VM_PFNMAP)) - || !(vm_flags & vma->vm_flags)) + if (!vma || + (vma->vm_flags & (VM_IO | VM_PFNMAP)) || + (!ignore && !(vm_flags & vma->vm_flags))) return i ? : -EFAULT; if (is_vm_hugetlb_page(vma)) { @@ -1266,6 +1276,23 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, } while (len); return i; } + +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} + EXPORT_SYMBOL(get_user_pages); pte_t *get_locked_pte(struct mm_struct *mm, unsigned long addr, @@ -1296,18 +1323,14 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, pte_t *pte; spinlock_t *ptl; - retval = mem_cgroup_charge(page, mm, GFP_KERNEL); - if (retval) - goto out; - retval = -EINVAL; if (PageAnon(page)) - goto out_uncharge; + goto out; retval = -ENOMEM; flush_dcache_page(page); pte = get_locked_pte(mm, addr, &ptl); if (!pte) - goto out_uncharge; + goto out; retval = -EBUSY; if (!pte_none(*pte)) goto out_unlock; @@ -1323,8 +1346,6 @@ static int insert_page(struct vm_area_struct *vma, unsigned long addr, return retval; out_unlock: pte_unmap_unlock(pte, ptl); -out_uncharge: - mem_cgroup_uncharge_page(page); out: return retval; } @@ -1858,6 +1879,15 @@ gotten: new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); if (!new_page) goto oom; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) { + lock_page(old_page); /* for LRU manipulation */ + clear_page_mlock(old_page); + unlock_page(old_page); + } cow_user_page(new_page, old_page, address, vma); __SetPageUptodate(new_page); @@ -1886,11 +1916,13 @@ gotten: * thread doing COW. */ ptep_clear_flush_notify(vma, address, page_table); - set_pte_at(mm, address, page_table, entry); - update_mmu_cache(vma, address, entry); - lru_cache_add_active(new_page); + SetPageSwapBacked(new_page); + lru_cache_add_active_or_unevictable(new_page, vma); page_add_new_anon_rmap(new_page, vma, address); +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); + update_mmu_cache(vma, address, entry); if (old_page) { /* * Only after switching the pte to the new page may @@ -2288,16 +2320,17 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(PGMAJFAULT); } + mark_page_accessed(page); + + lock_page(page); + delayacct_clear_flag(DELAYACCT_PF_SWAPIN); + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); ret = VM_FAULT_OOM; + unlock_page(page); goto out; } - mark_page_accessed(page); - lock_page(page); - delayacct_clear_flag(DELAYACCT_PF_SWAPIN); - /* * Back out if somebody else already faulted in this pte. */ @@ -2324,7 +2357,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, page_add_anon_rmap(page, vma, address); swap_free(entry); - if (vm_swap_full()) + if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) remove_exclusive_swap_page(page); unlock_page(page); @@ -2382,7 +2415,8 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, if (!pte_none(*page_table)) goto release; inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); page_add_new_anon_rmap(page, vma, address); set_pte_at(mm, address, page_table, entry); @@ -2423,6 +2457,7 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t entry; int anon = 0; + int charged = 0; struct page *dirty_page = NULL; struct vm_fault vmf; int ret; @@ -2463,6 +2498,18 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, ret = VM_FAULT_OOM; goto out; } + if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { + ret = VM_FAULT_OOM; + page_cache_release(page); + goto out; + } + charged = 1; + /* + * Don't let another task, with possibly unlocked vma, + * keep the mlocked page. + */ + if (vma->vm_flags & VM_LOCKED) + clear_page_mlock(vmf.page); copy_user_highpage(page, vmf.page, address, vma); __SetPageUptodate(page); } else { @@ -2497,11 +2544,6 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, } - if (mem_cgroup_charge(page, mm, GFP_KERNEL)) { - ret = VM_FAULT_OOM; - goto out; - } - page_table = pte_offset_map_lock(mm, pmd, address, &ptl); /* @@ -2520,11 +2562,11 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); - set_pte_at(mm, address, page_table, entry); if (anon) { - inc_mm_counter(mm, anon_rss); - lru_cache_add_active(page); - page_add_new_anon_rmap(page, vma, address); + inc_mm_counter(mm, anon_rss); + SetPageSwapBacked(page); + lru_cache_add_active_or_unevictable(page, vma); + page_add_new_anon_rmap(page, vma, address); } else { inc_mm_counter(mm, file_rss); page_add_file_rmap(page); @@ -2533,11 +2575,14 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, get_page(dirty_page); } } +//TODO: is this safe? do_anonymous_page() does it this way. + set_pte_at(mm, address, page_table, entry); /* no need to invalidate: a not-present page won't be cached */ update_mmu_cache(vma, address, entry); } else { - mem_cgroup_uncharge_page(page); + if (charged) + mem_cgroup_uncharge_page(page); if (anon) page_cache_release(page); else @@ -2772,19 +2817,9 @@ int make_pages_present(unsigned long addr, unsigned long end) len = DIV_ROUND_UP(end, PAGE_SIZE) - addr/PAGE_SIZE; ret = get_user_pages(current, current->mm, addr, len, write, 0, NULL, NULL); - if (ret < 0) { - /* - SUS require strange return value to mlock - - invalid addr generate to ENOMEM. - - out of memory should generate EAGAIN. - */ - if (ret == -EFAULT) - ret = -ENOMEM; - else if (ret == -ENOMEM) - ret = -EAGAIN; + if (ret < 0) return ret; - } - return ret == len ? 0 : -ENOMEM; + return ret == len ? 0 : -EFAULT; } #if !defined(__HAVE_ARCH_GATE_AREA) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 89fee2d..6837a10 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -26,6 +26,7 @@ #include <linux/delay.h> #include <linux/migrate.h> #include <linux/page-isolation.h> +#include <linux/pfn.h> #include <asm/tlbflush.h> @@ -323,11 +324,11 @@ int __remove_pages(struct zone *zone, unsigned long phys_start_pfn, BUG_ON(phys_start_pfn & ~PAGE_SECTION_MASK); BUG_ON(nr_pages % PAGES_PER_SECTION); - release_mem_region(phys_start_pfn << PAGE_SHIFT, nr_pages * PAGE_SIZE); - sections_to_remove = nr_pages / PAGES_PER_SECTION; for (i = 0; i < sections_to_remove; i++) { unsigned long pfn = phys_start_pfn + i*PAGES_PER_SECTION; + release_mem_region(pfn << PAGE_SHIFT, + PAGES_PER_SECTION << PAGE_SHIFT); ret = __remove_section(zone, __pfn_to_section(pfn)); if (ret) break; @@ -657,8 +658,9 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) * We can skip free pages. And we can only deal with pages on * LRU. */ - ret = isolate_lru_page(page, &source); + ret = isolate_lru_page(page); if (!ret) { /* Success */ + list_add_tail(&page->lru, &source); move_pages--; } else { /* Becasue we don't have big zone->lock. we should @@ -849,10 +851,19 @@ failed_removal: return ret; } + +int remove_memory(u64 start, u64 size) +{ + unsigned long start_pfn, end_pfn; + + start_pfn = PFN_DOWN(start); + end_pfn = start_pfn + PFN_DOWN(size); + return offline_pages(start_pfn, end_pfn, 120 * HZ); +} #else int remove_memory(u64 start, u64 size) { return -EINVAL; } -EXPORT_SYMBOL_GPL(remove_memory); #endif /* CONFIG_MEMORY_HOTREMOVE */ +EXPORT_SYMBOL_GPL(remove_memory); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 8336905..36f4257 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -93,6 +93,8 @@ #include <asm/tlbflush.h> #include <asm/uaccess.h> +#include "internal.h" + /* Internal flags */ #define MPOL_MF_DISCONTIG_OK (MPOL_MF_INTERNAL << 0) /* Skip checks for continuous vmas */ #define MPOL_MF_INVERT (MPOL_MF_INTERNAL << 1) /* Invert check for nodemask */ @@ -762,8 +764,11 @@ static void migrate_page_add(struct page *page, struct list_head *pagelist, /* * Avoid migrating a page that is shared with others. */ - if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) - isolate_lru_page(page, pagelist); + if ((flags & MPOL_MF_MOVE_ALL) || page_mapcount(page) == 1) { + if (!isolate_lru_page(page)) { + list_add_tail(&page->lru, pagelist); + } + } } static struct page *new_node_page(struct page *page, unsigned long node, int **x) @@ -2197,7 +2202,7 @@ static void gather_stats(struct page *page, void *private, int pte_dirty) if (PageSwapCache(page)) md->swapcache++; - if (PageActive(page)) + if (PageActive(page) || PageUnevictable(page)) md->active++; if (PageWriteback(page)) diff --git a/mm/migrate.c b/mm/migrate.c index 2a80136..6602941b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -37,36 +37,6 @@ #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) /* - * Isolate one page from the LRU lists. If successful put it onto - * the indicated list with elevated page count. - * - * Result: - * -EBUSY: page not on LRU list - * 0: page removed from LRU list and added to the specified list. - */ -int isolate_lru_page(struct page *page, struct list_head *pagelist) -{ - int ret = -EBUSY; - - if (PageLRU(page)) { - struct zone *zone = page_zone(page); - - spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && get_page_unless_zero(page)) { - ret = 0; - ClearPageLRU(page); - if (PageActive(page)) - del_page_from_active_list(zone, page); - else - del_page_from_inactive_list(zone, page); - list_add_tail(&page->lru, pagelist); - } - spin_unlock_irq(&zone->lru_lock); - } - return ret; -} - -/* * migrate_prep() needs to be called before we start compiling a list of pages * to be migrated using isolate_lru_page(). */ @@ -83,23 +53,9 @@ int migrate_prep(void) return 0; } -static inline void move_to_lru(struct page *page) -{ - if (PageActive(page)) { - /* - * lru_cache_add_active checks that - * the PG_active bit is off. - */ - ClearPageActive(page); - lru_cache_add_active(page); - } else { - lru_cache_add(page); - } - put_page(page); -} - /* - * Add isolated pages on the list back to the LRU. + * Add isolated pages on the list back to the LRU under page lock + * to avoid leaking evictable pages back onto unevictable list. * * returns the number of pages put back. */ @@ -111,7 +67,7 @@ int putback_lru_pages(struct list_head *l) list_for_each_entry_safe(page, page2, l, lru) { list_del(&page->lru); - move_to_lru(page); + putback_lru_page(page); count++; } return count; @@ -374,8 +330,6 @@ static int migrate_page_move_mapping(struct address_space *mapping, __inc_zone_page_state(newpage, NR_FILE_PAGES); spin_unlock_irq(&mapping->tree_lock); - if (!PageSwapCache(newpage)) - mem_cgroup_uncharge_cache_page(page); return 0; } @@ -385,6 +339,8 @@ static int migrate_page_move_mapping(struct address_space *mapping, */ static void migrate_page_copy(struct page *newpage, struct page *page) { + int anon; + copy_highpage(newpage, page); if (PageError(page)) @@ -393,8 +349,11 @@ static void migrate_page_copy(struct page *newpage, struct page *page) SetPageReferenced(newpage); if (PageUptodate(page)) SetPageUptodate(newpage); - if (PageActive(page)) + if (TestClearPageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); SetPageActive(newpage); + } else + unevictable_migrate_page(newpage, page); if (PageChecked(page)) SetPageChecked(newpage); if (PageMappedToDisk(page)) @@ -412,14 +371,20 @@ static void migrate_page_copy(struct page *newpage, struct page *page) __set_page_dirty_nobuffers(newpage); } + mlock_migrate_page(newpage, page); + #ifdef CONFIG_SWAP ClearPageSwapCache(page); #endif - ClearPageActive(page); ClearPagePrivate(page); set_page_private(page, 0); + /* page->mapping contains a flag for PageAnon() */ + anon = PageAnon(page); page->mapping = NULL; + if (!anon) /* This page was removed from radix-tree. */ + mem_cgroup_uncharge_cache_page(page); + /* * If any waiters have accumulated on the new page then * wake them up. @@ -594,6 +559,10 @@ static int fallback_migrate_page(struct address_space *mapping, * * The new page will have replaced the old page if this function * is successful. + * + * Return value: + * < 0 - error code + * == 0 - success */ static int move_to_new_page(struct page *newpage, struct page *page) { @@ -611,6 +580,8 @@ static int move_to_new_page(struct page *newpage, struct page *page) /* Prepare mapping for the new page.*/ newpage->index = page->index; newpage->mapping = page->mapping; + if (PageSwapBacked(page)) + SetPageSwapBacked(newpage); mapping = page_mapping(page); if (!mapping) @@ -654,9 +625,10 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, if (!newpage) return -ENOMEM; - if (page_count(page) == 1) + if (page_count(page) == 1) { /* page was freed from under us. So we are done. */ goto move_newpage; + } charge = mem_cgroup_prepare_migration(page, newpage); if (charge == -ENOMEM) { @@ -730,7 +702,6 @@ rcu_unlock: rcu_read_unlock(); unlock: - unlock_page(page); if (rc != -EAGAIN) { @@ -741,17 +712,19 @@ unlock: * restored. */ list_del(&page->lru); - move_to_lru(page); + putback_lru_page(page); } move_newpage: if (!charge) mem_cgroup_end_migration(newpage); + /* * Move the new page to the LRU. If migration was not successful * then this will free the page. */ - move_to_lru(newpage); + putback_lru_page(newpage); + if (result) { if (rc) *result = rc; @@ -858,9 +831,11 @@ static struct page *new_page_node(struct page *p, unsigned long private, * Move a set of pages as indicated in the pm array. The addr * field must be set to the virtual address of the page to be moved * and the node number must contain a valid target node. + * The pm array ends with node = MAX_NUMNODES. */ -static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, - int migrate_all) +static int do_move_page_to_node_array(struct mm_struct *mm, + struct page_to_node *pm, + int migrate_all) { int err; struct page_to_node *pp; @@ -914,7 +889,9 @@ static int do_move_pages(struct mm_struct *mm, struct page_to_node *pm, !migrate_all) goto put_and_set; - err = isolate_lru_page(page, &pagelist); + err = isolate_lru_page(page); + if (!err) + list_add_tail(&page->lru, &pagelist); put_and_set: /* * Either remove the duplicate refcount from @@ -926,36 +903,118 @@ set_status: pp->status = err; } + err = 0; if (!list_empty(&pagelist)) err = migrate_pages(&pagelist, new_page_node, (unsigned long)pm); - else - err = -ENOENT; up_read(&mm->mmap_sem); return err; } /* - * Determine the nodes of a list of pages. The addr in the pm array - * must have been set to the virtual address of which we want to determine - * the node number. + * Migrate an array of page address onto an array of nodes and fill + * the corresponding array of status. */ -static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) +static int do_pages_move(struct mm_struct *mm, struct task_struct *task, + unsigned long nr_pages, + const void __user * __user *pages, + const int __user *nodes, + int __user *status, int flags) { + struct page_to_node *pm = NULL; + nodemask_t task_nodes; + int err = 0; + int i; + + task_nodes = cpuset_mems_allowed(task); + + /* Limit nr_pages so that the multiplication may not overflow */ + if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { + err = -E2BIG; + goto out; + } + + pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); + if (!pm) { + err = -ENOMEM; + goto out; + } + + /* + * Get parameters from user space and initialize the pm + * array. Return various errors if the user did something wrong. + */ + for (i = 0; i < nr_pages; i++) { + const void __user *p; + + err = -EFAULT; + if (get_user(p, pages + i)) + goto out_pm; + + pm[i].addr = (unsigned long)p; + if (nodes) { + int node; + + if (get_user(node, nodes + i)) + goto out_pm; + + err = -ENODEV; + if (!node_state(node, N_HIGH_MEMORY)) + goto out_pm; + + err = -EACCES; + if (!node_isset(node, task_nodes)) + goto out_pm; + + pm[i].node = node; + } else + pm[i].node = 0; /* anything to not match MAX_NUMNODES */ + } + /* End marker */ + pm[nr_pages].node = MAX_NUMNODES; + + err = do_move_page_to_node_array(mm, pm, flags & MPOL_MF_MOVE_ALL); + if (err >= 0) + /* Return status information */ + for (i = 0; i < nr_pages; i++) + if (put_user(pm[i].status, status + i)) + err = -EFAULT; + +out_pm: + vfree(pm); +out: + return err; +} + +/* + * Determine the nodes of an array of pages and store it in an array of status. + */ +static int do_pages_stat(struct mm_struct *mm, unsigned long nr_pages, + const void __user * __user *pages, + int __user *status) +{ + unsigned long i; + int err; + down_read(&mm->mmap_sem); - for ( ; pm->node != MAX_NUMNODES; pm++) { + for (i = 0; i < nr_pages; i++) { + const void __user *p; + unsigned long addr; struct vm_area_struct *vma; struct page *page; - int err; err = -EFAULT; - vma = find_vma(mm, pm->addr); + if (get_user(p, pages+i)) + goto out; + addr = (unsigned long) p; + + vma = find_vma(mm, addr); if (!vma) goto set_status; - page = follow_page(vma, pm->addr, 0); + page = follow_page(vma, addr, 0); err = PTR_ERR(page); if (IS_ERR(page)) @@ -968,11 +1027,13 @@ static int do_pages_stat(struct mm_struct *mm, struct page_to_node *pm) err = page_to_nid(page); set_status: - pm->status = err; + put_user(err, status+i); } + err = 0; +out: up_read(&mm->mmap_sem); - return 0; + return err; } /* @@ -984,12 +1045,9 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, const int __user *nodes, int __user *status, int flags) { - int err = 0; - int i; struct task_struct *task; - nodemask_t task_nodes; struct mm_struct *mm; - struct page_to_node *pm = NULL; + int err; /* Check flags */ if (flags & ~(MPOL_MF_MOVE|MPOL_MF_MOVE_ALL)) @@ -1021,75 +1079,21 @@ asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, (current->uid != task->suid) && (current->uid != task->uid) && !capable(CAP_SYS_NICE)) { err = -EPERM; - goto out2; + goto out; } err = security_task_movememory(task); if (err) - goto out2; - - - task_nodes = cpuset_mems_allowed(task); - - /* Limit nr_pages so that the multiplication may not overflow */ - if (nr_pages >= ULONG_MAX / sizeof(struct page_to_node) - 1) { - err = -E2BIG; - goto out2; - } - - pm = vmalloc((nr_pages + 1) * sizeof(struct page_to_node)); - if (!pm) { - err = -ENOMEM; - goto out2; - } - - /* - * Get parameters from user space and initialize the pm - * array. Return various errors if the user did something wrong. - */ - for (i = 0; i < nr_pages; i++) { - const void __user *p; - - err = -EFAULT; - if (get_user(p, pages + i)) - goto out; - - pm[i].addr = (unsigned long)p; - if (nodes) { - int node; - - if (get_user(node, nodes + i)) - goto out; - - err = -ENODEV; - if (!node_state(node, N_HIGH_MEMORY)) - goto out; - - err = -EACCES; - if (!node_isset(node, task_nodes)) - goto out; + goto out; - pm[i].node = node; - } else - pm[i].node = 0; /* anything to not match MAX_NUMNODES */ + if (nodes) { + err = do_pages_move(mm, task, nr_pages, pages, nodes, status, + flags); + } else { + err = do_pages_stat(mm, nr_pages, pages, status); } - /* End marker */ - pm[nr_pages].node = MAX_NUMNODES; - - if (nodes) - err = do_move_pages(mm, pm, flags & MPOL_MF_MOVE_ALL); - else - err = do_pages_stat(mm, pm); - - if (err >= 0) - /* Return status information */ - for (i = 0; i < nr_pages; i++) - if (put_user(pm[i].status, status + i)) - err = -EFAULT; out: - vfree(pm); -out2: mmput(mm); return err; } @@ -8,10 +8,18 @@ #include <linux/capability.h> #include <linux/mman.h> #include <linux/mm.h> +#include <linux/swap.h> +#include <linux/swapops.h> +#include <linux/pagemap.h> #include <linux/mempolicy.h> #include <linux/syscalls.h> #include <linux/sched.h> #include <linux/module.h> +#include <linux/rmap.h> +#include <linux/mmzone.h> +#include <linux/hugetlb.h> + +#include "internal.h" int can_do_mlock(void) { @@ -23,17 +31,381 @@ int can_do_mlock(void) } EXPORT_SYMBOL(can_do_mlock); +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * Mlocked pages are marked with PageMlocked() flag for efficient testing + * in vmscan and, possibly, the fault path; and to support semi-accurate + * statistics. + * + * An mlocked page [PageMlocked(page)] is unevictable. As such, it will + * be placed on the LRU "unevictable" list, rather than the [in]active lists. + * The unevictable list is an LRU sibling list to the [in]active lists. + * PageUnevictable is set to indicate the unevictable state. + * + * When lazy mlocking via vmscan, it is important to ensure that the + * vma's VM_LOCKED status is not concurrently being modified, otherwise we + * may have mlocked a page that is being munlocked. So lazy mlock must take + * the mmap_sem for read, and verify that the vma really is locked + * (see mm/rmap.c). + */ + +/* + * LRU accounting for clear_page_mlock() + */ +void __clear_page_mlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page)); + + if (!page->mapping) { /* truncated ? */ + return; + } + + dec_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGCLEARED); + if (!isolate_lru_page(page)) { + putback_lru_page(page); + } else { + /* + * Page not on the LRU yet. Flush all pagevecs and retry. + */ + lru_add_drain_all(); + if (!isolate_lru_page(page)) + putback_lru_page(page); + else if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + + } +} + +/* + * Mark page as mlocked if not already. + * If page on LRU, isolate and putback to move to unevictable list. + */ +void mlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (!TestSetPageMlocked(page)) { + inc_zone_page_state(page, NR_MLOCK); + count_vm_event(UNEVICTABLE_PGMLOCKED); + if (!isolate_lru_page(page)) + putback_lru_page(page); + } +} + +/* + * called from munlock()/munmap() path with page supposedly on the LRU. + * + * Note: unlike mlock_vma_page(), we can't just clear the PageMlocked + * [in try_to_munlock()] and then attempt to isolate the page. We must + * isolate the page to keep others from messing with its unevictable + * and mlocked state while trying to munlock. However, we pre-clear the + * mlocked state anyway as we might lose the isolation race and we might + * not get another chance to clear PageMlocked. If we successfully + * isolate the page and try_to_munlock() detects other VM_LOCKED vmas + * mapping the page, it will restore the PageMlocked state, unless the page + * is mapped in a non-linear vma. So, we go ahead and SetPageMlocked(), + * perhaps redundantly. + * If we lose the isolation race, and the page is mapped by other VM_LOCKED + * vmas, we'll detect this in vmscan--via try_to_munlock() or try_to_unmap() + * either of which will restore the PageMlocked state by calling + * mlock_vma_page() above, if it can grab the vma's mmap sem. + */ +static void munlock_vma_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + if (TestClearPageMlocked(page)) { + dec_zone_page_state(page, NR_MLOCK); + if (!isolate_lru_page(page)) { + int ret = try_to_munlock(page); + /* + * did try_to_unlock() succeed or punt? + */ + if (ret == SWAP_SUCCESS || ret == SWAP_AGAIN) + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + + putback_lru_page(page); + } else { + /* + * We lost the race. let try_to_unmap() deal + * with it. At least we get the page state and + * mlock stats right. However, page is still on + * the noreclaim list. We'll fix that up when + * the page is eventually freed or we scan the + * noreclaim list. + */ + if (PageUnevictable(page)) + count_vm_event(UNEVICTABLE_PGSTRANDED); + else + count_vm_event(UNEVICTABLE_PGMUNLOCKED); + } + } +} + +/** + * __mlock_vma_pages_range() - mlock/munlock a range of pages in the vma. + * @vma: target vma + * @start: start address + * @end: end address + * @mlock: 0 indicate munlock, otherwise mlock. + * + * If @mlock == 0, unlock an mlocked range; + * else mlock the range of pages. This takes care of making the pages present , + * too. + * + * return 0 on success, negative error code on error. + * + * vma->vm_mm->mmap_sem must be held for at least read. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int mlock) +{ + struct mm_struct *mm = vma->vm_mm; + unsigned long addr = start; + struct page *pages[16]; /* 16 gives a reasonable batch */ + int nr_pages = (end - start) / PAGE_SIZE; + int ret; + int gup_flags = 0; + + VM_BUG_ON(start & ~PAGE_MASK); + VM_BUG_ON(end & ~PAGE_MASK); + VM_BUG_ON(start < vma->vm_start); + VM_BUG_ON(end > vma->vm_end); + VM_BUG_ON((!rwsem_is_locked(&mm->mmap_sem)) && + (atomic_read(&mm->mm_users) != 0)); + + /* + * mlock: don't page populate if page has PROT_NONE permission. + * munlock: the pages always do munlock althrough + * its has PROT_NONE permission. + */ + if (!mlock) + gup_flags |= GUP_FLAGS_IGNORE_VMA_PERMISSIONS; + + if (vma->vm_flags & VM_WRITE) + gup_flags |= GUP_FLAGS_WRITE; + + lru_add_drain_all(); /* push cached pages to LRU */ + + while (nr_pages > 0) { + int i; + + cond_resched(); + + /* + * get_user_pages makes pages present if we are + * setting mlock. and this extra reference count will + * disable migration of this page. However, page may + * still be truncated out from under us. + */ + ret = __get_user_pages(current, mm, addr, + min_t(int, nr_pages, ARRAY_SIZE(pages)), + gup_flags, pages, NULL); + /* + * This can happen for, e.g., VM_NONLINEAR regions before + * a page has been allocated and mapped at a given offset, + * or for addresses that map beyond end of a file. + * We'll mlock the the pages if/when they get faulted in. + */ + if (ret < 0) + break; + if (ret == 0) { + /* + * We know the vma is there, so the only time + * we cannot get a single page should be an + * error (ret < 0) case. + */ + WARN_ON(1); + break; + } + + lru_add_drain(); /* push cached pages to LRU */ + + for (i = 0; i < ret; i++) { + struct page *page = pages[i]; + + lock_page(page); + /* + * Because we lock page here and migration is blocked + * by the elevated reference, we need only check for + * page truncation (file-cache only). + */ + if (page->mapping) { + if (mlock) + mlock_vma_page(page); + else + munlock_vma_page(page); + } + unlock_page(page); + put_page(page); /* ref from get_user_pages() */ + + /* + * here we assume that get_user_pages() has given us + * a list of virtually contiguous pages. + */ + addr += PAGE_SIZE; /* for next get_user_pages() */ + nr_pages--; + } + ret = 0; + } + + lru_add_drain_all(); /* to update stats */ + + return ret; /* count entire vma as locked_vm */ +} + +/* + * convert get_user_pages() return value to posix mlock() error + */ +static int __mlock_posix_error_return(long retval) +{ + if (retval == -EFAULT) + retval = -ENOMEM; + else if (retval == -ENOMEM) + retval = -EAGAIN; + return retval; +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +/* + * Just make pages present if VM_LOCKED. No-op if unlocking. + */ +static long __mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end, + int mlock) +{ + if (mlock && (vma->vm_flags & VM_LOCKED)) + return make_pages_present(start, end); + return 0; +} + +static inline int __mlock_posix_error_return(long retval) +{ + return 0; +} + +#endif /* CONFIG_UNEVICTABLE_LRU */ + +/** + * mlock_vma_pages_range() - mlock pages in specified vma range. + * @vma - the vma containing the specfied address range + * @start - starting address in @vma to mlock + * @end - end address [+1] in @vma to mlock + * + * For mmap()/mremap()/expansion of mlocked vma. + * + * return 0 on success for "normal" vmas. + * + * return number of pages [> 0] to be removed from locked_vm on success + * of "special" vmas. + * + * return negative error if vma spanning @start-@range disappears while + * mmap semaphore is dropped. Unlikely? + */ +long mlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + struct mm_struct *mm = vma->vm_mm; + int nr_pages = (end - start) / PAGE_SIZE; + BUG_ON(!(vma->vm_flags & VM_LOCKED)); + + /* + * filter unlockable vmas + */ + if (vma->vm_flags & (VM_IO | VM_PFNMAP)) + goto no_mlock; + + if (!((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current))) { + long error; + downgrade_write(&mm->mmap_sem); + + error = __mlock_vma_pages_range(vma, start, end, 1); + + up_read(&mm->mmap_sem); + /* vma can change or disappear */ + down_write(&mm->mmap_sem); + vma = find_vma(mm, start); + /* non-NULL vma must contain @start, but need to check @end */ + if (!vma || end > vma->vm_end) + return -ENOMEM; + + return 0; /* hide other errors from mmap(), et al */ + } + + /* + * User mapped kernel pages or huge pages: + * make these pages present to populate the ptes, but + * fall thru' to reset VM_LOCKED--no need to unlock, and + * return nr_pages so these don't get counted against task's + * locked limit. huge pages are already counted against + * locked vm limit. + */ + make_pages_present(start, end); + +no_mlock: + vma->vm_flags &= ~VM_LOCKED; /* and don't come back! */ + return nr_pages; /* error or pages NOT mlocked */ +} + + +/* + * munlock_vma_pages_range() - munlock all pages in the vma range.' + * @vma - vma containing range to be munlock()ed. + * @start - start address in @vma of the range + * @end - end of range in @vma. + * + * For mremap(), munmap() and exit(). + * + * Called with @vma VM_LOCKED. + * + * Returns with VM_LOCKED cleared. Callers must be prepared to + * deal with this. + * + * We don't save and restore VM_LOCKED here because pages are + * still on lru. In unmap path, pages might be scanned by reclaim + * and re-mlocked by try_to_{munlock|unmap} before we unmap and + * free them. This will result in freeing mlocked pages. + */ +void munlock_vma_pages_range(struct vm_area_struct *vma, + unsigned long start, unsigned long end) +{ + vma->vm_flags &= ~VM_LOCKED; + __mlock_vma_pages_range(vma, start, end, 0); +} + +/* + * mlock_fixup - handle mlock[all]/munlock[all] requests. + * + * Filters out "special" vmas -- VM_LOCKED never gets set for these, and + * munlock is a no-op. However, for some special vmas, we go ahead and + * populate the ptes via make_pages_present(). + * + * For vmas that pass the filters, merge/split as appropriate. + */ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, unsigned long end, unsigned int newflags) { - struct mm_struct * mm = vma->vm_mm; + struct mm_struct *mm = vma->vm_mm; pgoff_t pgoff; - int pages; + int nr_pages; int ret = 0; - - if (newflags == vma->vm_flags) { - *prev = vma; - goto out; + int lock = newflags & VM_LOCKED; + + if (newflags == vma->vm_flags || + (vma->vm_flags & (VM_IO | VM_PFNMAP))) + goto out; /* don't set VM_LOCKED, don't count */ + + if ((vma->vm_flags & (VM_DONTEXPAND | VM_RESERVED)) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current)) { + if (lock) + make_pages_present(start, end); + goto out; /* don't set VM_LOCKED, don't count */ } pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); @@ -44,8 +416,6 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, goto success; } - *prev = vma; - if (start != vma->vm_start) { ret = split_vma(mm, vma, start, 1); if (ret) @@ -60,24 +430,61 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, success: /* + * Keep track of amount of locked VM. + */ + nr_pages = (end - start) >> PAGE_SHIFT; + if (!lock) + nr_pages = -nr_pages; + mm->locked_vm += nr_pages; + + /* * vm_flags is protected by the mmap_sem held in write mode. * It's okay if try_to_unmap_one unmaps a page just after we - * set VM_LOCKED, make_pages_present below will bring it back. + * set VM_LOCKED, __mlock_vma_pages_range will bring it back. */ vma->vm_flags = newflags; - /* - * Keep track of amount of locked VM. - */ - pages = (end - start) >> PAGE_SHIFT; - if (newflags & VM_LOCKED) { - pages = -pages; - if (!(newflags & VM_IO)) - ret = make_pages_present(start, end); + if (lock) { + /* + * mmap_sem is currently held for write. Downgrade the write + * lock to a read lock so that other faults, mmap scans, ... + * while we fault in all pages. + */ + downgrade_write(&mm->mmap_sem); + + ret = __mlock_vma_pages_range(vma, start, end, 1); + + /* + * Need to reacquire mmap sem in write mode, as our callers + * expect this. We have no support for atomically upgrading + * a sem to write, so we need to check for ranges while sem + * is unlocked. + */ + up_read(&mm->mmap_sem); + /* vma can change or disappear */ + down_write(&mm->mmap_sem); + *prev = find_vma(mm, start); + /* non-NULL *prev must contain @start, but need to check @end */ + if (!(*prev) || end > (*prev)->vm_end) + ret = -ENOMEM; + else if (ret > 0) { + mm->locked_vm -= ret; + ret = 0; + } else + ret = __mlock_posix_error_return(ret); /* translate if needed */ + } else { + /* + * TODO: for unlocking, pages will already be resident, so + * we don't need to wait for allocations/reclaim/pagein, ... + * However, unlocking a very large region can still take a + * while. Should we downgrade the semaphore for both lock + * AND unlock ? + */ + __mlock_vma_pages_range(vma, start, end, 0); } - mm->locked_vm -= pages; out: + *prev = vma; return ret; } @@ -410,7 +410,7 @@ void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, rb_insert_color(&vma->vm_rb, &mm->mm_rb); } -static inline void __vma_link_file(struct vm_area_struct *vma) +static void __vma_link_file(struct vm_area_struct *vma) { struct file * file; @@ -662,8 +662,6 @@ again: remove_next = 1 + (end > next->vm_end); * If the vma has a ->close operation then the driver probably needs to release * per-vma resources, so we don't attempt to merge those. */ -#define VM_SPECIAL (VM_IO | VM_DONTEXPAND | VM_RESERVED | VM_PFNMAP) - static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags) { @@ -972,6 +970,7 @@ unsigned long do_mmap_pgoff(struct file * file, unsigned long addr, return -EPERM; vm_flags |= VM_LOCKED; } + /* mlock MCL_FUTURE? */ if (vm_flags & VM_LOCKED) { unsigned long locked, lock_limit; @@ -1139,10 +1138,12 @@ munmap_back: * The VM_SHARED test is necessary because shmem_zero_setup * will create the file object for a shared anonymous map below. */ - if (!file && !(vm_flags & VM_SHARED) && - vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, NULL, pgoff, NULL)) - goto out; + if (!file && !(vm_flags & VM_SHARED)) { + vma = vma_merge(mm, prev, addr, addr + len, vm_flags, + NULL, NULL, pgoff, NULL); + if (vma) + goto out; + } /* * Determine the object being mapped and call the appropriate @@ -1224,10 +1225,14 @@ out: mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); - } - if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) + /* + * makes pages present; downgrades, drops, reacquires mmap_sem + */ + long nr_pages = mlock_vma_pages_range(vma, addr, addr + len); + if (nr_pages < 0) + return nr_pages; /* vma gone! */ + mm->locked_vm += (len >> PAGE_SHIFT) - nr_pages; + } else if ((flags & MAP_POPULATE) && !(flags & MAP_NONBLOCK)) make_pages_present(addr, addr + len); return addr; @@ -1586,7 +1591,7 @@ static int acct_stack_growth(struct vm_area_struct * vma, unsigned long size, un * vma is the last one with address > vma->vm_end. Have to extend vma. */ #ifndef CONFIG_IA64 -static inline +static #endif int expand_upwards(struct vm_area_struct *vma, unsigned long address) { @@ -1636,7 +1641,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -static inline int expand_downwards(struct vm_area_struct *vma, +static int expand_downwards(struct vm_area_struct *vma, unsigned long address) { int error; @@ -1698,10 +1703,12 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + if (expand_stack(prev, addr)) return NULL; - if (prev->vm_flags & VM_LOCKED) - make_pages_present(addr, prev->vm_end); + if (prev->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(prev, addr, prev->vm_end) < 0) + return NULL; /* vma gone! */ + } return prev; } #else @@ -1727,8 +1734,10 @@ find_extend_vma(struct mm_struct * mm, unsigned long addr) start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; - if (vma->vm_flags & VM_LOCKED) - make_pages_present(addr, start); + if (vma->vm_flags & VM_LOCKED) { + if (mlock_vma_pages_range(vma, addr, start) < 0) + return NULL; /* vma gone! */ + } return vma; } #endif @@ -1747,8 +1756,6 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) long nrpages = vma_pages(vma); mm->total_vm -= nrpages; - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); } while (vma); @@ -1914,6 +1921,20 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) vma = prev? prev->vm_next: mm->mmap; /* + * unlock any mlock()ed ranges before detaching vmas + */ + if (mm->locked_vm) { + struct vm_area_struct *tmp = vma; + while (tmp && tmp->vm_start < end) { + if (tmp->vm_flags & VM_LOCKED) { + mm->locked_vm -= vma_pages(tmp); + munlock_vma_pages_all(tmp); + } + tmp = tmp->vm_next; + } + } + + /* * Remove the vma's, and unmap the actual pages */ detach_vmas_to_be_unmapped(mm, vma, prev, end); @@ -2025,8 +2046,9 @@ unsigned long do_brk(unsigned long addr, unsigned long len) return -ENOMEM; /* Can we just expand an old private anonymous mapping? */ - if (vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL)) + vma = vma_merge(mm, prev, addr, addr + len, flags, + NULL, NULL, pgoff, NULL); + if (vma) goto out; /* @@ -2048,8 +2070,8 @@ unsigned long do_brk(unsigned long addr, unsigned long len) out: mm->total_vm += len >> PAGE_SHIFT; if (flags & VM_LOCKED) { - mm->locked_vm += len >> PAGE_SHIFT; - make_pages_present(addr, addr + len); + if (!mlock_vma_pages_range(vma, addr, addr + len)) + mm->locked_vm += (len >> PAGE_SHIFT); } return addr; } @@ -2060,7 +2082,7 @@ EXPORT_SYMBOL(do_brk); void exit_mmap(struct mm_struct *mm) { struct mmu_gather *tlb; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; unsigned long nr_accounted = 0; unsigned long end; @@ -2068,6 +2090,15 @@ void exit_mmap(struct mm_struct *mm) arch_exit_mmap(mm); mmu_notifier_release(mm); + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { + if (vma->vm_flags & VM_LOCKED) + munlock_vma_pages_all(vma); + vma = vma->vm_next; + } + } + vma = mm->mmap; lru_add_drain(); flush_cache_mm(mm); tlb = tlb_gather_mmu(mm, 1); diff --git a/mm/mremap.c b/mm/mremap.c index 1a77439..58a2908 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -24,6 +24,8 @@ #include <asm/cacheflush.h> #include <asm/tlbflush.h> +#include "internal.h" + static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr) { pgd_t *pgd; @@ -238,8 +240,8 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (vm_flags & VM_LOCKED) { mm->locked_vm += new_len >> PAGE_SHIFT; if (new_len > old_len) - make_pages_present(new_addr + old_len, - new_addr + new_len); + mlock_vma_pages_range(new_vma, new_addr + old_len, + new_addr + new_len); } return new_addr; @@ -379,7 +381,7 @@ unsigned long do_mremap(unsigned long addr, vm_stat_account(mm, vma->vm_flags, vma->vm_file, pages); if (vma->vm_flags & VM_LOCKED) { mm->locked_vm += pages; - make_pages_present(addr + old_len, + mlock_vma_pages_range(vma, addr + old_len, addr + new_len); } ret = addr; @@ -34,6 +34,8 @@ #include <asm/tlb.h> #include <asm/tlbflush.h> +#include "internal.h" + void *high_memory; struct page *mem_map; unsigned long max_mapnr; @@ -128,20 +130,16 @@ unsigned int kobjsize(const void *objp) return PAGE_SIZE << compound_order(page); } -/* - * get a list of pages in an address range belonging to the specified process - * and indicate the VMA that covers each page - * - this is potentially dodgy as we may end incrementing the page count of a - * slab page or a secondary page from a compound page - * - don't permit access to VMAs that don't support it, such as I/O mappings - */ -int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, int len, int write, int force, - struct page **pages, struct vm_area_struct **vmas) +int __get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int flags, + struct page **pages, struct vm_area_struct **vmas) { struct vm_area_struct *vma; unsigned long vm_flags; int i; + int write = !!(flags & GUP_FLAGS_WRITE); + int force = !!(flags & GUP_FLAGS_FORCE); + int ignore = !!(flags & GUP_FLAGS_IGNORE_VMA_PERMISSIONS); /* calculate required read or write permissions. * - if 'force' is set, we only require the "MAY" flags. @@ -156,7 +154,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, /* protect what we can, including chardevs */ if (vma->vm_flags & (VM_IO | VM_PFNMAP) || - !(vm_flags & vma->vm_flags)) + (!ignore && !(vm_flags & vma->vm_flags))) goto finish_or_fault; if (pages) { @@ -174,6 +172,30 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, finish_or_fault: return i ? : -EFAULT; } + + +/* + * get a list of pages in an address range belonging to the specified process + * and indicate the VMA that covers each page + * - this is potentially dodgy as we may end incrementing the page count of a + * slab page or a secondary page from a compound page + * - don't permit access to VMAs that don't support it, such as I/O mappings + */ +int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, + unsigned long start, int len, int write, int force, + struct page **pages, struct vm_area_struct **vmas) +{ + int flags = 0; + + if (write) + flags |= GUP_FLAGS_WRITE; + if (force) + flags |= GUP_FLAGS_FORCE; + + return __get_user_pages(tsk, mm, + start, len, flags, + pages, vmas); +} EXPORT_SYMBOL(get_user_pages); DEFINE_RWLOCK(vmlist_lock); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 24de8b6..2970e35 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -7,7 +7,7 @@ * Contains functions related to writing back dirty pages at the * address_space level. * - * 10Apr2002 akpm@zip.com.au + * 10Apr2002 Andrew Morton * Initial version */ @@ -329,9 +329,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) struct zone *z = &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - x += zone_page_state(z, NR_FREE_PAGES) - + zone_page_state(z, NR_INACTIVE) - + zone_page_state(z, NR_ACTIVE); + x += zone_page_state(z, NR_FREE_PAGES) + zone_lru_pages(z); } /* * Make sure that the number of highmem pages is never larger @@ -355,9 +353,7 @@ unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) - + global_page_state(NR_INACTIVE) - + global_page_state(NR_ACTIVE); + x = global_page_state(NR_FREE_PAGES) + global_lru_pages(); if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); @@ -876,6 +872,7 @@ int write_cache_pages(struct address_space *mapping, pgoff_t end; /* Inclusive */ int scanned = 0; int range_whole = 0; + long nr_to_write = wbc->nr_to_write; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -939,7 +936,7 @@ retry: unlock_page(page); ret = 0; } - if (ret || (--(wbc->nr_to_write) <= 0)) + if (ret || (--nr_to_write <= 0)) done = 1; if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; @@ -958,11 +955,12 @@ retry: index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + if (!wbc->no_nrwrite_index_update) { + if (wbc->range_cyclic || (range_whole && nr_to_write > 0)) + mapping->writeback_index = index; + wbc->nr_to_write = nr_to_write; + } - if (wbc->range_cont) - wbc->range_start = index << PAGE_CACHE_SHIFT; return ret; } EXPORT_SYMBOL(write_cache_pages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 27b8681..d0a240f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -44,7 +44,7 @@ #include <linux/backing-dev.h> #include <linux/fault-inject.h> #include <linux/page-isolation.h> -#include <linux/memcontrol.h> +#include <linux/page_cgroup.h> #include <linux/debugobjects.h> #include <asm/tlbflush.h> @@ -223,17 +223,12 @@ static inline int bad_range(struct zone *zone, struct page *page) static void bad_page(struct page *page) { - void *pc = page_get_page_cgroup(page); - printk(KERN_EMERG "Bad page state in process '%s'\n" KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n", current->comm, page, (int)(2*sizeof(unsigned long)), (unsigned long)page->flags, page->mapping, page_mapcount(page), page_count(page)); - if (pc) { - printk(KERN_EMERG "cgroup:%p\n", pc); - page_reset_bad_cgroup(page); - } + printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n" KERN_EMERG "Backtrace:\n"); dump_stack(); @@ -454,14 +449,16 @@ static inline void __free_one_page(struct page *page, static inline int free_pages_check(struct page *page) { + free_page_mlock(page); if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_FREE))) bad_page(page); if (PageDirty(page)) __ClearPageDirty(page); + if (PageSwapBacked(page)) + __ClearPageSwapBacked(page); /* * For now, we report if PG_reserved was found set, but do not * clear it, and do not free the page. But we shall soon need @@ -600,7 +597,6 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) { if (unlikely(page_mapcount(page) | (page->mapping != NULL) | - (page_get_page_cgroup(page) != NULL) | (page_count(page) != 0) | (page->flags & PAGE_FLAGS_CHECK_AT_PREP))) bad_page(page); @@ -614,7 +610,11 @@ static int prep_new_page(struct page *page, int order, gfp_t gfp_flags) page->flags &= ~(1 << PG_uptodate | 1 << PG_error | 1 << PG_reclaim | 1 << PG_referenced | 1 << PG_arch_1 | - 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk); + 1 << PG_owner_priv_1 | 1 << PG_mappedtodisk +#ifdef CONFIG_UNEVICTABLE_LRU + | 1 << PG_mlocked +#endif + ); set_page_private(page, 0); set_page_refcounted(page); @@ -1862,10 +1862,21 @@ void show_free_areas(void) } } - printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu unstable:%lu\n" + printk("Active_anon:%lu active_file:%lu inactive_anon:%lu\n" + " inactive_file:%lu" +//TODO: check/adjust line lengths +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lu" +#endif + " dirty:%lu writeback:%lu unstable:%lu\n" " free:%lu slab:%lu mapped:%lu pagetables:%lu bounce:%lu\n", - global_page_state(NR_ACTIVE), - global_page_state(NR_INACTIVE), + global_page_state(NR_ACTIVE_ANON), + global_page_state(NR_ACTIVE_FILE), + global_page_state(NR_INACTIVE_ANON), + global_page_state(NR_INACTIVE_FILE), +#ifdef CONFIG_UNEVICTABLE_LRU + global_page_state(NR_UNEVICTABLE), +#endif global_page_state(NR_FILE_DIRTY), global_page_state(NR_WRITEBACK), global_page_state(NR_UNSTABLE_NFS), @@ -1888,8 +1899,13 @@ void show_free_areas(void) " min:%lukB" " low:%lukB" " high:%lukB" - " active:%lukB" - " inactive:%lukB" + " active_anon:%lukB" + " inactive_anon:%lukB" + " active_file:%lukB" + " inactive_file:%lukB" +#ifdef CONFIG_UNEVICTABLE_LRU + " unevictable:%lukB" +#endif " present:%lukB" " pages_scanned:%lu" " all_unreclaimable? %s" @@ -1899,8 +1915,13 @@ void show_free_areas(void) K(zone->pages_min), K(zone->pages_low), K(zone->pages_high), - K(zone_page_state(zone, NR_ACTIVE)), - K(zone_page_state(zone, NR_INACTIVE)), + K(zone_page_state(zone, NR_ACTIVE_ANON)), + K(zone_page_state(zone, NR_INACTIVE_ANON)), + K(zone_page_state(zone, NR_ACTIVE_FILE)), + K(zone_page_state(zone, NR_INACTIVE_FILE)), +#ifdef CONFIG_UNEVICTABLE_LRU + K(zone_page_state(zone, NR_UNEVICTABLE)), +#endif K(zone->present_pages), zone->pages_scanned, (zone_is_all_unreclaimable(zone) ? "yes" : "no") @@ -3410,10 +3431,12 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, pgdat->nr_zones = 0; init_waitqueue_head(&pgdat->kswapd_wait); pgdat->kswapd_max_order = 0; + pgdat_page_cgroup_init(pgdat); for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, memmap_pages; + enum lru_list l; size = zone_spanned_pages_in_node(nid, j, zones_size); realsize = size - zone_absent_pages_in_node(nid, j, @@ -3428,8 +3451,8 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, PAGE_ALIGN(size * sizeof(struct page)) >> PAGE_SHIFT; if (realsize >= memmap_pages) { realsize -= memmap_pages; - mminit_dprintk(MMINIT_TRACE, "memmap_init", - "%s zone: %lu pages used for memmap\n", + printk(KERN_DEBUG + " %s zone: %lu pages used for memmap\n", zone_names[j], memmap_pages); } else printk(KERN_WARNING @@ -3439,8 +3462,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, /* Account for reserved pages */ if (j == 0 && realsize > dma_reserve) { realsize -= dma_reserve; - mminit_dprintk(MMINIT_TRACE, "memmap_init", - "%s zone: %lu pages reserved\n", + printk(KERN_DEBUG " %s zone: %lu pages reserved\n", zone_names[0], dma_reserve); } @@ -3465,10 +3487,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, zone->prev_priority = DEF_PRIORITY; zone_pcp_init(zone); - INIT_LIST_HEAD(&zone->active_list); - INIT_LIST_HEAD(&zone->inactive_list); - zone->nr_scan_active = 0; - zone->nr_scan_inactive = 0; + for_each_lru(l) { + INIT_LIST_HEAD(&zone->lru[l].list); + zone->lru[l].nr_scan = 0; + } + zone->recent_rotated[0] = 0; + zone->recent_rotated[1] = 0; + zone->recent_scanned[0] = 0; + zone->recent_scanned[1] = 0; zap_zone_vm_stats(zone); zone->flags = 0; if (!size) @@ -3952,7 +3978,7 @@ static void check_for_regular_memory(pg_data_t *pgdat) void __init free_area_init_nodes(unsigned long *max_zone_pfn) { unsigned long nid; - enum zone_type i; + int i; /* Sort early_node_map as initialisation assumes it is sorted */ sort_node_map(); @@ -4210,7 +4236,7 @@ void setup_per_zone_pages_min(void) for_each_zone(zone) { u64 tmp; - spin_lock_irqsave(&zone->lru_lock, flags); + spin_lock_irqsave(&zone->lock, flags); tmp = (u64)pages_min * zone->present_pages; do_div(tmp, lowmem_pages); if (is_highmem(zone)) { @@ -4242,13 +4268,53 @@ void setup_per_zone_pages_min(void) zone->pages_low = zone->pages_min + (tmp >> 2); zone->pages_high = zone->pages_min + (tmp >> 1); setup_zone_migrate_reserve(zone); - spin_unlock_irqrestore(&zone->lru_lock, flags); + spin_unlock_irqrestore(&zone->lock, flags); } /* update totalreserve_pages */ calculate_totalreserve_pages(); } +/** + * setup_per_zone_inactive_ratio - called when min_free_kbytes changes. + * + * The inactive anon list should be small enough that the VM never has to + * do too much work, but large enough that each inactive page has a chance + * to be referenced again before it is swapped out. + * + * The inactive_anon ratio is the target ratio of ACTIVE_ANON to + * INACTIVE_ANON pages on this zone's LRU, maintained by the + * pageout code. A zone->inactive_ratio of 3 means 3:1 or 25% of + * the anonymous pages are kept on the inactive list. + * + * total target max + * memory ratio inactive anon + * ------------------------------------- + * 10MB 1 5MB + * 100MB 1 50MB + * 1GB 3 250MB + * 10GB 10 0.9GB + * 100GB 31 3GB + * 1TB 101 10GB + * 10TB 320 32GB + */ +void setup_per_zone_inactive_ratio(void) +{ + struct zone *zone; + + for_each_zone(zone) { + unsigned int gb, ratio; + + /* Zone size in gigabytes */ + gb = zone->present_pages >> (30 - PAGE_SHIFT); + ratio = int_sqrt(10 * gb); + if (!ratio) + ratio = 1; + + zone->inactive_ratio = ratio; + } +} + /* * Initialise min_free_kbytes. * @@ -4286,6 +4352,7 @@ static int __init init_per_zone_pages_min(void) min_free_kbytes = 65536; setup_per_zone_pages_min(); setup_per_zone_lowmem_reserve(); + setup_per_zone_inactive_ratio(); return 0; } module_init(init_per_zone_pages_min) diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c new file mode 100644 index 0000000..5d86550 --- /dev/null +++ b/mm/page_cgroup.c @@ -0,0 +1,237 @@ +#include <linux/mm.h> +#include <linux/mmzone.h> +#include <linux/bootmem.h> +#include <linux/bit_spinlock.h> +#include <linux/page_cgroup.h> +#include <linux/hash.h> +#include <linux/memory.h> + +static void __meminit +__init_page_cgroup(struct page_cgroup *pc, unsigned long pfn) +{ + pc->flags = 0; + pc->mem_cgroup = NULL; + pc->page = pfn_to_page(pfn); +} +static unsigned long total_usage; + +#if !defined(CONFIG_SPARSEMEM) + + +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ + pgdat->node_page_cgroup = NULL; +} + +struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + unsigned long offset; + struct page_cgroup *base; + + base = NODE_DATA(page_to_nid(page))->node_page_cgroup; + if (unlikely(!base)) + return NULL; + + offset = pfn - NODE_DATA(page_to_nid(page))->node_start_pfn; + return base + offset; +} + +static int __init alloc_node_page_cgroup(int nid) +{ + struct page_cgroup *base, *pc; + unsigned long table_size; + unsigned long start_pfn, nr_pages, index; + + start_pfn = NODE_DATA(nid)->node_start_pfn; + nr_pages = NODE_DATA(nid)->node_spanned_pages; + + table_size = sizeof(struct page_cgroup) * nr_pages; + + base = __alloc_bootmem_node_nopanic(NODE_DATA(nid), + table_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS)); + if (!base) + return -ENOMEM; + for (index = 0; index < nr_pages; index++) { + pc = base + index; + __init_page_cgroup(pc, start_pfn + index); + } + NODE_DATA(nid)->node_page_cgroup = base; + total_usage += table_size; + return 0; +} + +void __init page_cgroup_init(void) +{ + + int nid, fail; + + for_each_online_node(nid) { + fail = alloc_node_page_cgroup(nid); + if (fail) + goto fail; + } + printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); + printk(KERN_INFO "please try cgroup_disable=memory option if you" + " don't want\n"); + return; +fail: + printk(KERN_CRIT "allocation of page_cgroup was failed.\n"); + printk(KERN_CRIT "please try cgroup_disable=memory boot option\n"); + panic("Out of memory"); +} + +#else /* CONFIG_FLAT_NODE_MEM_MAP */ + +struct page_cgroup *lookup_page_cgroup(struct page *page) +{ + unsigned long pfn = page_to_pfn(page); + struct mem_section *section = __pfn_to_section(pfn); + + return section->page_cgroup + pfn; +} + +int __meminit init_section_page_cgroup(unsigned long pfn) +{ + struct mem_section *section; + struct page_cgroup *base, *pc; + unsigned long table_size; + int nid, index; + + section = __pfn_to_section(pfn); + + if (section->page_cgroup) + return 0; + + nid = page_to_nid(pfn_to_page(pfn)); + + table_size = sizeof(struct page_cgroup) * PAGES_PER_SECTION; + base = kmalloc_node(table_size, GFP_KERNEL, nid); + if (!base) + base = vmalloc_node(table_size, nid); + + if (!base) { + printk(KERN_ERR "page cgroup allocation failure\n"); + return -ENOMEM; + } + + for (index = 0; index < PAGES_PER_SECTION; index++) { + pc = base + index; + __init_page_cgroup(pc, pfn + index); + } + + section = __pfn_to_section(pfn); + section->page_cgroup = base - pfn; + total_usage += table_size; + return 0; +} +#ifdef CONFIG_MEMORY_HOTPLUG +void __free_page_cgroup(unsigned long pfn) +{ + struct mem_section *ms; + struct page_cgroup *base; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_cgroup) + return; + base = ms->page_cgroup + pfn; + ms->page_cgroup = NULL; + if (is_vmalloc_addr(base)) + vfree(base); + else + kfree(base); +} + +int online_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, + int nid) +{ + unsigned long start, end, pfn; + int fail = 0; + + start = start_pfn & (PAGES_PER_SECTION - 1); + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + + for (pfn = start; !fail && pfn < end; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_cgroup(pfn); + } + if (!fail) + return 0; + + /* rollback */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_cgroup(pfn); + + return -ENOMEM; +} + +int offline_page_cgroup(unsigned long start_pfn, + unsigned long nr_pages, int nid) +{ + unsigned long start, end, pfn; + + start = start_pfn & (PAGES_PER_SECTION - 1); + end = ALIGN(start_pfn + nr_pages, PAGES_PER_SECTION); + + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __free_page_cgroup(pfn); + return 0; + +} + +static int page_cgroup_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mn = arg; + int ret = 0; + switch (action) { + case MEM_GOING_ONLINE: + ret = online_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_CANCEL_ONLINE: + case MEM_OFFLINE: + offline_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; + case MEM_GOING_OFFLINE: + break; + case MEM_ONLINE: + case MEM_CANCEL_OFFLINE: + break; + } + ret = notifier_from_errno(ret); + return ret; +} + +#endif + +void __init page_cgroup_init(void) +{ + unsigned long pfn; + int fail = 0; + + for (pfn = 0; !fail && pfn < max_pfn; pfn += PAGES_PER_SECTION) { + if (!pfn_present(pfn)) + continue; + fail = init_section_page_cgroup(pfn); + } + if (fail) { + printk(KERN_CRIT "try cgroup_disable=memory boot option\n"); + panic("Out of memory"); + } else { + hotplug_memory_notifier(page_cgroup_callback, 0); + } + printk(KERN_INFO "allocated %ld bytes of page_cgroup\n", total_usage); + printk(KERN_INFO "please try cgroup_disable=memory option if you don't" + " want\n"); +} + +void __init pgdat_page_cgroup_init(struct pglist_data *pgdat) +{ + return; +} + +#endif diff --git a/mm/pdflush.c b/mm/pdflush.c index 0cbe0c6..a0a14c4 100644 --- a/mm/pdflush.c +++ b/mm/pdflush.c @@ -3,7 +3,7 @@ * * Copyright (C) 2002, Linus Torvalds. * - * 09Apr2002 akpm@zip.com.au + * 09Apr2002 Andrew Morton * Initial version * 29Feb2004 kaos@sgi.com * Move worker thread creation to kthread to avoid chewing diff --git a/mm/readahead.c b/mm/readahead.c index 77e8ddf..bec83c1 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -3,7 +3,7 @@ * * Copyright (C) 2002, Linus Torvalds * - * 09Apr2002 akpm@zip.com.au + * 09Apr2002 Andrew Morton * Initial version. */ @@ -229,7 +229,7 @@ int do_page_cache_readahead(struct address_space *mapping, struct file *filp, */ unsigned long max_sane_readahead(unsigned long nr) { - return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE) + return min(nr, (node_page_state(numa_node_id(), NR_INACTIVE_FILE) + node_page_state(numa_node_id(), NR_FREE_PAGES)) / 2); } @@ -53,9 +53,47 @@ #include <asm/tlbflush.h> -struct kmem_cache *anon_vma_cachep; +#include "internal.h" -/* This must be called under the mmap_sem. */ +static struct kmem_cache *anon_vma_cachep; + +static inline struct anon_vma *anon_vma_alloc(void) +{ + return kmem_cache_alloc(anon_vma_cachep, GFP_KERNEL); +} + +static inline void anon_vma_free(struct anon_vma *anon_vma) +{ + kmem_cache_free(anon_vma_cachep, anon_vma); +} + +/** + * anon_vma_prepare - attach an anon_vma to a memory region + * @vma: the memory region in question + * + * This makes sure the memory mapping described by 'vma' has + * an 'anon_vma' attached to it, so that we can associate the + * anonymous pages mapped into it with that anon_vma. + * + * The common case will be that we already have one, but if + * if not we either need to find an adjacent mapping that we + * can re-use the anon_vma from (very common when the only + * reason for splitting a vma has been mprotect()), or we + * allocate a new one. + * + * Anon-vma allocations are very subtle, because we may have + * optimistically looked up an anon_vma in page_lock_anon_vma() + * and that may actually touch the spinlock even in the newly + * allocated vma (it depends on RCU to make sure that the + * anon_vma isn't actually destroyed). + * + * As a result, we need to do proper anon_vma locking even + * for the new allocation. At the same time, we do not want + * to do any locking for the common case of already having + * an anon_vma. + * + * This must be called with the mmap_sem held for reading. + */ int anon_vma_prepare(struct vm_area_struct *vma) { struct anon_vma *anon_vma = vma->anon_vma; @@ -63,20 +101,17 @@ int anon_vma_prepare(struct vm_area_struct *vma) might_sleep(); if (unlikely(!anon_vma)) { struct mm_struct *mm = vma->vm_mm; - struct anon_vma *allocated, *locked; + struct anon_vma *allocated; anon_vma = find_mergeable_anon_vma(vma); - if (anon_vma) { - allocated = NULL; - locked = anon_vma; - spin_lock(&locked->lock); - } else { + allocated = NULL; + if (!anon_vma) { anon_vma = anon_vma_alloc(); if (unlikely(!anon_vma)) return -ENOMEM; allocated = anon_vma; - locked = NULL; } + spin_lock(&anon_vma->lock); /* page_table_lock to protect against threads */ spin_lock(&mm->page_table_lock); @@ -87,8 +122,7 @@ int anon_vma_prepare(struct vm_area_struct *vma) } spin_unlock(&mm->page_table_lock); - if (locked) - spin_unlock(&locked->lock); + spin_unlock(&anon_vma->lock); if (unlikely(allocated)) anon_vma_free(allocated); } @@ -157,7 +191,7 @@ void __init anon_vma_init(void) * Getting a lock on a stable anon_vma from a page off the LRU is * tricky: page_lock_anon_vma rely on RCU to guard against the races. */ -static struct anon_vma *page_lock_anon_vma(struct page *page) +struct anon_vma *page_lock_anon_vma(struct page *page) { struct anon_vma *anon_vma; unsigned long anon_mapping; @@ -177,7 +211,7 @@ out: return NULL; } -static void page_unlock_anon_vma(struct anon_vma *anon_vma) +void page_unlock_anon_vma(struct anon_vma *anon_vma) { spin_unlock(&anon_vma->lock); rcu_read_unlock(); @@ -268,6 +302,32 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm, return NULL; } +/** + * page_mapped_in_vma - check whether a page is really mapped in a VMA + * @page: the page to test + * @vma: the VMA to test + * + * Returns 1 if the page is mapped into the page tables of the VMA, 0 + * if the page is not mapped into the page tables of this VMA. Only + * valid for normal file or anonymous VMAs. + */ +static int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) +{ + unsigned long address; + pte_t *pte; + spinlock_t *ptl; + + address = vma_address(page, vma); + if (address == -EFAULT) /* out of vma range */ + return 0; + pte = page_check_address(page, vma->vm_mm, address, &ptl, 1); + if (!pte) /* the page is not in this mm */ + return 0; + pte_unmap_unlock(pte, ptl); + + return 1; +} + /* * Subfunctions of page_referenced: page_referenced_one called * repeatedly from either page_referenced_anon or page_referenced_file. @@ -289,10 +349,17 @@ static int page_referenced_one(struct page *page, if (!pte) goto out; + /* + * Don't want to elevate referenced for mlocked page that gets this far, + * in order that it progresses to try_to_unmap and is moved to the + * unevictable list. + */ if (vma->vm_flags & VM_LOCKED) { - referenced++; *mapcount = 1; /* break early from loop */ - } else if (ptep_clear_flush_young_notify(vma, address, pte)) + goto out_unmap; + } + + if (ptep_clear_flush_young_notify(vma, address, pte)) referenced++; /* Pretend the page is referenced if the task has the @@ -301,6 +368,7 @@ static int page_referenced_one(struct page *page, rwsem_is_locked(&mm->mmap_sem)) referenced++; +out_unmap: (*mapcount)--; pte_unmap_unlock(pte, ptl); out: @@ -390,11 +458,6 @@ static int page_referenced_file(struct page *page, */ if (mem_cont && !mm_match_cgroup(vma->vm_mm, mem_cont)) continue; - if ((vma->vm_flags & (VM_LOCKED|VM_MAYSHARE)) - == (VM_LOCKED|VM_MAYSHARE)) { - referenced++; - break; - } referenced += page_referenced_one(page, vma, &mapcount); if (!mapcount) break; @@ -674,8 +737,8 @@ void page_remove_rmap(struct page *page, struct vm_area_struct *vma) page_clear_dirty(page); set_page_dirty(page); } - - mem_cgroup_uncharge_page(page); + if (PageAnon(page)) + mem_cgroup_uncharge_page(page); __dec_zone_page_state(page, PageAnon(page) ? NR_ANON_PAGES : NR_FILE_MAPPED); /* @@ -717,11 +780,16 @@ static int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * If it's recently referenced (perhaps page_referenced * skipped over this mm) then we should reactivate it. */ - if (!migration && ((vma->vm_flags & VM_LOCKED) || - (ptep_clear_flush_young_notify(vma, address, pte)))) { - ret = SWAP_FAIL; - goto out_unmap; - } + if (!migration) { + if (vma->vm_flags & VM_LOCKED) { + ret = SWAP_MLOCK; + goto out_unmap; + } + if (ptep_clear_flush_young_notify(vma, address, pte)) { + ret = SWAP_FAIL; + goto out_unmap; + } + } /* Nuke the page table entry. */ flush_cache_page(vma, address, page_to_pfn(page)); @@ -802,12 +870,17 @@ out: * For very sparsely populated VMAs this is a little inefficient - chances are * there there won't be many ptes located within the scan cluster. In this case * maybe we could scan further - to the end of the pte page, perhaps. + * + * Mlocked pages: check VM_LOCKED under mmap_sem held for read, if we can + * acquire it without blocking. If vma locked, mlock the pages in the cluster, + * rather than unmapping them. If we encounter the "check_page" that vmscan is + * trying to unmap, return SWAP_MLOCK, else default SWAP_AGAIN. */ #define CLUSTER_SIZE min(32*PAGE_SIZE, PMD_SIZE) #define CLUSTER_MASK (~(CLUSTER_SIZE - 1)) -static void try_to_unmap_cluster(unsigned long cursor, - unsigned int *mapcount, struct vm_area_struct *vma) +static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, + struct vm_area_struct *vma, struct page *check_page) { struct mm_struct *mm = vma->vm_mm; pgd_t *pgd; @@ -819,6 +892,8 @@ static void try_to_unmap_cluster(unsigned long cursor, struct page *page; unsigned long address; unsigned long end; + int ret = SWAP_AGAIN; + int locked_vma = 0; address = (vma->vm_start + cursor) & CLUSTER_MASK; end = address + CLUSTER_SIZE; @@ -829,15 +904,26 @@ static void try_to_unmap_cluster(unsigned long cursor, pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - return; + return ret; pud = pud_offset(pgd, address); if (!pud_present(*pud)) - return; + return ret; pmd = pmd_offset(pud, address); if (!pmd_present(*pmd)) - return; + return ret; + + /* + * MLOCK_PAGES => feature is configured. + * if we can acquire the mmap_sem for read, and vma is VM_LOCKED, + * keep the sem while scanning the cluster for mlocking pages. + */ + if (MLOCK_PAGES && down_read_trylock(&vma->vm_mm->mmap_sem)) { + locked_vma = (vma->vm_flags & VM_LOCKED); + if (!locked_vma) + up_read(&vma->vm_mm->mmap_sem); /* don't need it */ + } pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -850,6 +936,13 @@ static void try_to_unmap_cluster(unsigned long cursor, page = vm_normal_page(vma, address, *pte); BUG_ON(!page || PageAnon(page)); + if (locked_vma) { + mlock_vma_page(page); /* no-op if already mlocked */ + if (page == check_page) + ret = SWAP_MLOCK; + continue; /* don't unmap */ + } + if (ptep_clear_flush_young_notify(vma, address, pte)) continue; @@ -871,39 +964,104 @@ static void try_to_unmap_cluster(unsigned long cursor, (*mapcount)--; } pte_unmap_unlock(pte - 1, ptl); + if (locked_vma) + up_read(&vma->vm_mm->mmap_sem); + return ret; } -static int try_to_unmap_anon(struct page *page, int migration) +/* + * common handling for pages mapped in VM_LOCKED vmas + */ +static int try_to_mlock_page(struct page *page, struct vm_area_struct *vma) +{ + int mlocked = 0; + + if (down_read_trylock(&vma->vm_mm->mmap_sem)) { + if (vma->vm_flags & VM_LOCKED) { + mlock_vma_page(page); + mlocked++; /* really mlocked the page */ + } + up_read(&vma->vm_mm->mmap_sem); + } + return mlocked; +} + +/** + * try_to_unmap_anon - unmap or unlock anonymous page using the object-based + * rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock + * + * Find all the mappings of a page using the mapping pointer and the vma chains + * contained in the anon_vma struct it points to. + * + * This function is only called from try_to_unmap/try_to_munlock for + * anonymous pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. + */ +static int try_to_unmap_anon(struct page *page, int unlock, int migration) { struct anon_vma *anon_vma; struct vm_area_struct *vma; + unsigned int mlocked = 0; int ret = SWAP_AGAIN; + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ + anon_vma = page_lock_anon_vma(page); if (!anon_vma) return ret; list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - break; + if (MLOCK_PAGES && unlikely(unlock)) { + if (!((vma->vm_flags & VM_LOCKED) && + page_mapped_in_vma(page, vma))) + continue; /* must visit all unlocked vmas */ + ret = SWAP_MLOCK; /* saw at least one mlocked vma */ + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + break; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } } page_unlock_anon_vma(anon_vma); + + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ + return ret; } /** - * try_to_unmap_file - unmap file page using the object-based rmap method - * @page: the page to unmap - * @migration: migration flag + * try_to_unmap_file - unmap/unlock file page using the object-based rmap method + * @page: the page to unmap/unlock + * @unlock: request for unlock rather than unmap [unlikely] + * @migration: unmapping for migration - ignored if @unlock * * Find all the mappings of a page using the mapping pointer and the vma chains * contained in the address_space struct it points to. * - * This function is only called from try_to_unmap for object-based pages. + * This function is only called from try_to_unmap/try_to_munlock for + * object-based pages. + * When called from try_to_munlock(), the mmap_sem of the mm containing the vma + * where the page was found will be held for write. So, we won't recheck + * vm_flags for that VMA. That should be OK, because that vma shouldn't be + * 'LOCKED. */ -static int try_to_unmap_file(struct page *page, int migration) +static int try_to_unmap_file(struct page *page, int unlock, int migration) { struct address_space *mapping = page->mapping; pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); @@ -914,20 +1072,44 @@ static int try_to_unmap_file(struct page *page, int migration) unsigned long max_nl_cursor = 0; unsigned long max_nl_size = 0; unsigned int mapcount; + unsigned int mlocked = 0; + + if (MLOCK_PAGES && unlikely(unlock)) + ret = SWAP_SUCCESS; /* default for try_to_munlock() */ spin_lock(&mapping->i_mmap_lock); vma_prio_tree_foreach(vma, &iter, &mapping->i_mmap, pgoff, pgoff) { - ret = try_to_unmap_one(page, vma, migration); - if (ret == SWAP_FAIL || !page_mapped(page)) - goto out; + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; + } else { + ret = try_to_unmap_one(page, vma, migration); + if (ret == SWAP_FAIL || !page_mapped(page)) + goto out; + } + if (ret == SWAP_MLOCK) { + mlocked = try_to_mlock_page(page, vma); + if (mlocked) + break; /* stop if actually mlocked page */ + } } + if (mlocked) + goto out; + if (list_empty(&mapping->i_mmap_nonlinear)) goto out; list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if ((vma->vm_flags & VM_LOCKED) && !migration) + if (MLOCK_PAGES && unlikely(unlock)) { + if (!(vma->vm_flags & VM_LOCKED)) + continue; /* must visit all vmas */ + ret = SWAP_MLOCK; /* leave mlocked == 0 */ + goto out; /* no need to look further */ + } + if (!MLOCK_PAGES && !migration && (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; if (cursor > max_nl_cursor) @@ -937,7 +1119,7 @@ static int try_to_unmap_file(struct page *page, int migration) max_nl_size = cursor; } - if (max_nl_size == 0) { /* any nonlinears locked or reserved */ + if (max_nl_size == 0) { /* all nonlinears locked or reserved ? */ ret = SWAP_FAIL; goto out; } @@ -961,12 +1143,16 @@ static int try_to_unmap_file(struct page *page, int migration) do { list_for_each_entry(vma, &mapping->i_mmap_nonlinear, shared.vm_set.list) { - if ((vma->vm_flags & VM_LOCKED) && !migration) + if (!MLOCK_PAGES && !migration && + (vma->vm_flags & VM_LOCKED)) continue; cursor = (unsigned long) vma->vm_private_data; while ( cursor < max_nl_cursor && cursor < vma->vm_end - vma->vm_start) { - try_to_unmap_cluster(cursor, &mapcount, vma); + ret = try_to_unmap_cluster(cursor, &mapcount, + vma, page); + if (ret == SWAP_MLOCK) + mlocked = 2; /* to return below */ cursor += CLUSTER_SIZE; vma->vm_private_data = (void *) cursor; if ((int)mapcount <= 0) @@ -987,6 +1173,10 @@ static int try_to_unmap_file(struct page *page, int migration) vma->vm_private_data = NULL; out: spin_unlock(&mapping->i_mmap_lock); + if (mlocked) + ret = SWAP_MLOCK; /* actually mlocked the page */ + else if (ret == SWAP_MLOCK) + ret = SWAP_AGAIN; /* saw VM_LOCKED vma */ return ret; } @@ -1002,6 +1192,7 @@ out: * SWAP_SUCCESS - we succeeded in removing all mappings * SWAP_AGAIN - we missed a mapping, try again later * SWAP_FAIL - the page is unswappable + * SWAP_MLOCK - page is mlocked. */ int try_to_unmap(struct page *page, int migration) { @@ -1010,12 +1201,36 @@ int try_to_unmap(struct page *page, int migration) BUG_ON(!PageLocked(page)); if (PageAnon(page)) - ret = try_to_unmap_anon(page, migration); + ret = try_to_unmap_anon(page, 0, migration); else - ret = try_to_unmap_file(page, migration); - - if (!page_mapped(page)) + ret = try_to_unmap_file(page, 0, migration); + if (ret != SWAP_MLOCK && !page_mapped(page)) ret = SWAP_SUCCESS; return ret; } +#ifdef CONFIG_UNEVICTABLE_LRU +/** + * try_to_munlock - try to munlock a page + * @page: the page to be munlocked + * + * Called from munlock code. Checks all of the VMAs mapping the page + * to make sure nobody else has this page mlocked. The page will be + * returned with PG_mlocked cleared if no other vmas have it mlocked. + * + * Return values are: + * + * SWAP_SUCCESS - no vma's holding page mlocked. + * SWAP_AGAIN - page mapped in mlocked vma -- couldn't acquire mmap sem + * SWAP_MLOCK - page is now mlocked. + */ +int try_to_munlock(struct page *page) +{ + VM_BUG_ON(!PageLocked(page) || PageLRU(page)); + + if (PageAnon(page)) + return try_to_unmap_anon(page, 1, 0); + else + return try_to_unmap_file(page, 1, 0); +} +#endif @@ -50,14 +50,12 @@ #include <linux/migrate.h> #include <linux/highmem.h> #include <linux/seq_file.h> +#include <linux/magic.h> #include <asm/uaccess.h> #include <asm/div64.h> #include <asm/pgtable.h> -/* This magic number is used in glibc for posix shared memory */ -#define TMPFS_MAGIC 0x01021994 - #define ENTRIES_PER_PAGE (PAGE_CACHE_SIZE/sizeof(unsigned long)) #define ENTRIES_PER_PAGEPAGE (ENTRIES_PER_PAGE*ENTRIES_PER_PAGE) #define BLOCKS_PER_PAGE (PAGE_CACHE_SIZE/512) @@ -201,7 +199,7 @@ static struct vm_operations_struct shmem_vm_ops; static struct backing_dev_info shmem_backing_dev_info __read_mostly = { .ra_pages = 0, /* No readahead */ - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = default_unplug_io_fn, }; @@ -1369,6 +1367,7 @@ repeat: error = -ENOMEM; goto failed; } + SetPageSwapBacked(filepage); /* Precharge page while we can wait, compensate after */ error = mem_cgroup_cache_charge(filepage, current->mm, @@ -1478,12 +1477,16 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) if (!user_shm_lock(inode->i_size, user)) goto out_nomem; info->flags |= VM_LOCKED; + mapping_set_unevictable(file->f_mapping); } if (!lock && (info->flags & VM_LOCKED) && user) { user_shm_unlock(inode->i_size, user); info->flags &= ~VM_LOCKED; + mapping_clear_unevictable(file->f_mapping); + scan_mapping_unevictable_pages(file->f_mapping); } retval = 0; + out_nomem: spin_unlock(&info->lock); return retval; @@ -2582,6 +2585,7 @@ put_memory: shmem_unacct_size(flags, size); return ERR_PTR(error); } +EXPORT_SYMBOL_GPL(shmem_file_setup); /** * shmem_zero_setup - setup a shared anonymous mapping @@ -31,11 +31,12 @@ #include <linux/backing-dev.h> #include <linux/memcontrol.h> +#include "internal.h" + /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec, lru_add_pvecs); -static DEFINE_PER_CPU(struct pagevec, lru_add_active_pvecs); +static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); /* @@ -116,8 +117,9 @@ static void pagevec_move_tail(struct pagevec *pvec) zone = pagezone; spin_lock(&zone->lru_lock); } - if (PageLRU(page) && !PageActive(page)) { - list_move_tail(&page->lru, &zone->inactive_list); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int lru = page_is_file_cache(page); + list_move_tail(&page->lru, &zone->lru[lru].list); pgmoved++; } } @@ -136,7 +138,7 @@ static void pagevec_move_tail(struct pagevec *pvec) void rotate_reclaimable_page(struct page *page) { if (!PageLocked(page) && !PageDirty(page) && !PageActive(page) && - PageLRU(page)) { + !PageUnevictable(page) && PageLRU(page)) { struct pagevec *pvec; unsigned long flags; @@ -157,12 +159,19 @@ void activate_page(struct page *page) struct zone *zone = page_zone(page); spin_lock_irq(&zone->lru_lock); - if (PageLRU(page) && !PageActive(page)) { - del_page_from_inactive_list(zone, page); + if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) { + int file = page_is_file_cache(page); + int lru = LRU_BASE + file; + del_page_from_lru_list(zone, page, lru); + SetPageActive(page); - add_page_to_active_list(zone, page); + lru += LRU_ACTIVE; + add_page_to_lru_list(zone, page, lru); __count_vm_event(PGACTIVATE); - mem_cgroup_move_lists(page, true); + mem_cgroup_move_lists(page, lru); + + zone->recent_rotated[!!file]++; + zone->recent_scanned[!!file]++; } spin_unlock_irq(&zone->lru_lock); } @@ -176,7 +185,8 @@ void activate_page(struct page *page) */ void mark_page_accessed(struct page *page) { - if (!PageActive(page) && PageReferenced(page) && PageLRU(page)) { + if (!PageActive(page) && !PageUnevictable(page) && + PageReferenced(page) && PageLRU(page)) { activate_page(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { @@ -186,28 +196,73 @@ void mark_page_accessed(struct page *page) EXPORT_SYMBOL(mark_page_accessed); -/** - * lru_cache_add: add a page to the page lists - * @page: the page to add - */ -void lru_cache_add(struct page *page) +void __lru_cache_add(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs); + struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; page_cache_get(page); if (!pagevec_add(pvec, page)) - __pagevec_lru_add(pvec); + ____pagevec_lru_add(pvec, lru); put_cpu_var(lru_add_pvecs); } -void lru_cache_add_active(struct page *page) +/** + * lru_cache_add_lru - add a page to a page list + * @page: the page to be added to the LRU. + * @lru: the LRU list to which the page is added. + */ +void lru_cache_add_lru(struct page *page, enum lru_list lru) { - struct pagevec *pvec = &get_cpu_var(lru_add_active_pvecs); + if (PageActive(page)) { + VM_BUG_ON(PageUnevictable(page)); + ClearPageActive(page); + } else if (PageUnevictable(page)) { + VM_BUG_ON(PageActive(page)); + ClearPageUnevictable(page); + } - page_cache_get(page); - if (!pagevec_add(pvec, page)) - __pagevec_lru_add_active(pvec); - put_cpu_var(lru_add_active_pvecs); + VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); + __lru_cache_add(page, lru); +} + +/** + * add_page_to_unevictable_list - add a page to the unevictable list + * @page: the page to be added to the unevictable list + * + * Add page directly to its zone's unevictable list. To avoid races with + * tasks that might be making the page evictable, through eg. munlock, + * munmap or exit, while it's not on the lru, we want to add the page + * while it's locked or otherwise "invisible" to other tasks. This is + * difficult to do when using the pagevec cache, so bypass that. + */ +void add_page_to_unevictable_list(struct page *page) +{ + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + SetPageUnevictable(page); + SetPageLRU(page); + add_page_to_lru_list(zone, page, LRU_UNEVICTABLE); + spin_unlock_irq(&zone->lru_lock); +} + +/** + * lru_cache_add_active_or_unevictable + * @page: the page to be added to LRU + * @vma: vma in which page is mapped for determining reclaimability + * + * place @page on active or unevictable LRU list, depending on + * page_evictable(). Note that if the page is not evictable, + * it goes directly back onto it's zone's unevictable list. It does + * NOT use a per cpu pagevec. + */ +void lru_cache_add_active_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + if (page_evictable(page, vma)) + lru_cache_add_lru(page, LRU_ACTIVE + page_is_file_cache(page)); + else + add_page_to_unevictable_list(page); } /* @@ -217,15 +272,15 @@ void lru_cache_add_active(struct page *page) */ static void drain_cpu_pagevecs(int cpu) { + struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); struct pagevec *pvec; + int lru; - pvec = &per_cpu(lru_add_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec); - - pvec = &per_cpu(lru_add_active_pvecs, cpu); - if (pagevec_count(pvec)) - __pagevec_lru_add_active(pvec); + for_each_lru(lru) { + pvec = &pvecs[lru - LRU_BASE]; + if (pagevec_count(pvec)) + ____pagevec_lru_add(pvec, lru); + } pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -244,7 +299,7 @@ void lru_add_drain(void) put_cpu(); } -#ifdef CONFIG_NUMA +#if defined(CONFIG_NUMA) || defined(CONFIG_UNEVICTABLE_LRU) static void lru_add_drain_per_cpu(struct work_struct *dummy) { lru_add_drain(); @@ -308,6 +363,7 @@ void release_pages(struct page **pages, int nr, int cold) if (PageLRU(page)) { struct zone *pagezone = page_zone(page); + if (pagezone != zone) { if (zone) spin_unlock_irqrestore(&zone->lru_lock, @@ -380,10 +436,11 @@ void __pagevec_release_nonlru(struct pagevec *pvec) * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void __pagevec_lru_add(struct pagevec *pvec) +void ____pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) { int i; struct zone *zone = NULL; + VM_BUG_ON(is_unevictable_lru(lru)); for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; @@ -395,9 +452,13 @@ void __pagevec_lru_add(struct pagevec *pvec) zone = pagezone; spin_lock_irq(&zone->lru_lock); } + VM_BUG_ON(PageActive(page)); + VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - add_page_to_inactive_list(zone, page); + if (is_active_lru(lru)) + SetPageActive(page); + add_page_to_lru_list(zone, page, lru); } if (zone) spin_unlock_irq(&zone->lru_lock); @@ -405,48 +466,45 @@ void __pagevec_lru_add(struct pagevec *pvec) pagevec_reinit(pvec); } -EXPORT_SYMBOL(__pagevec_lru_add); +EXPORT_SYMBOL(____pagevec_lru_add); -void __pagevec_lru_add_active(struct pagevec *pvec) +/* + * Try to drop buffers from the pages in a pagevec + */ +void pagevec_strip(struct pagevec *pvec) { int i; - struct zone *zone = NULL; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - struct zone *pagezone = page_zone(page); - if (pagezone != zone) { - if (zone) - spin_unlock_irq(&zone->lru_lock); - zone = pagezone; - spin_lock_irq(&zone->lru_lock); + if (PagePrivate(page) && trylock_page(page)) { + if (PagePrivate(page)) + try_to_release_page(page, 0); + unlock_page(page); } - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(PageActive(page)); - SetPageActive(page); - add_page_to_active_list(zone, page); } - if (zone) - spin_unlock_irq(&zone->lru_lock); - release_pages(pvec->pages, pvec->nr, pvec->cold); - pagevec_reinit(pvec); } -/* - * Try to drop buffers from the pages in a pagevec +/** + * pagevec_swap_free - try to free swap space from the pages in a pagevec + * @pvec: pagevec with swapcache pages to free the swap space of + * + * The caller needs to hold an extra reference to each page and + * not hold the page lock on the pages. This function uses a + * trylock on the page lock so it may not always free the swap + * space associated with a page. */ -void pagevec_strip(struct pagevec *pvec) +void pagevec_swap_free(struct pagevec *pvec) { int i; for (i = 0; i < pagevec_count(pvec); i++) { struct page *page = pvec->pages[i]; - if (PagePrivate(page) && trylock_page(page)) { - if (PagePrivate(page)) - try_to_release_page(page, 0); + if (PageSwapCache(page) && trylock_page(page)) { + if (PageSwapCache(page)) + remove_exclusive_swap_page_ref(page); unlock_page(page); } } diff --git a/mm/swap_state.c b/mm/swap_state.c index 797c383..3353c90 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -33,7 +33,7 @@ static const struct address_space_operations swap_aops = { }; static struct backing_dev_info swap_backing_dev_info = { - .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, + .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK | BDI_CAP_SWAP_BACKED, .unplug_io_fn = swap_unplug_io_fn, }; @@ -75,6 +75,7 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask) BUG_ON(!PageLocked(page)); BUG_ON(PageSwapCache(page)); BUG_ON(PagePrivate(page)); + BUG_ON(!PageSwapBacked(page)); error = radix_tree_preload(gfp_mask); if (!error) { page_cache_get(page); @@ -302,17 +303,19 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * re-using the just freed swap entry for an existing page. * May fail (-ENOMEM) if radix-tree node allocation failed. */ - set_page_locked(new_page); + __set_page_locked(new_page); + SetPageSwapBacked(new_page); err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL); if (likely(!err)) { /* * Initiate read into locked page and return. */ - lru_cache_add_active(new_page); + lru_cache_add_anon(new_page); swap_readpage(NULL, new_page); return new_page; } - clear_page_locked(new_page); + ClearPageSwapBacked(new_page); + __clear_page_locked(new_page); swap_free(entry); } while (err != -ENOMEM); diff --git a/mm/swapfile.c b/mm/swapfile.c index 1e330f2..90cb67a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -344,7 +344,7 @@ int can_share_swap_page(struct page *page) * Work out if there are any other processes sharing this * swap cache page. Free it if you can. Return success. */ -int remove_exclusive_swap_page(struct page *page) +static int remove_exclusive_swap_page_count(struct page *page, int count) { int retval; struct swap_info_struct * p; @@ -357,7 +357,7 @@ int remove_exclusive_swap_page(struct page *page) return 0; if (PageWriteback(page)) return 0; - if (page_count(page) != 2) /* 2: us + cache */ + if (page_count(page) != count) /* us + cache + ptes */ return 0; entry.val = page_private(page); @@ -370,7 +370,7 @@ int remove_exclusive_swap_page(struct page *page) if (p->swap_map[swp_offset(entry)] == 1) { /* Recheck the page count with the swapcache lock held.. */ spin_lock_irq(&swapper_space.tree_lock); - if ((page_count(page) == 2) && !PageWriteback(page)) { + if ((page_count(page) == count) && !PageWriteback(page)) { __delete_from_swap_cache(page); SetPageDirty(page); retval = 1; @@ -388,6 +388,25 @@ int remove_exclusive_swap_page(struct page *page) } /* + * Most of the time the page should have two references: one for the + * process and one for the swap cache. + */ +int remove_exclusive_swap_page(struct page *page) +{ + return remove_exclusive_swap_page_count(page, 2); +} + +/* + * The pageout code holds an extra reference to the page. That raises + * the reference count to test for to 2 for a page that is only in the + * swap cache plus 1 for each process that maps the page. + */ +int remove_exclusive_swap_page_ref(struct page *page) +{ + return remove_exclusive_swap_page_count(page, 2 + page_mapcount(page)); +} + +/* * Free the swap entry like above, but also try to * free the page cache entry if it is the last user. */ @@ -403,7 +422,7 @@ void free_swap_and_cache(swp_entry_t entry) if (p) { if (swap_entry_free(p, swp_offset(entry)) == 1) { page = find_get_page(&swapper_space, entry.val); - if (page && unlikely(!trylock_page(page))) { + if (page && !trylock_page(page)) { page_cache_release(page); page = NULL; } diff --git a/mm/truncate.c b/mm/truncate.c index 6650c1d..1229211 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -3,7 +3,7 @@ * * Copyright (C) 2002, Linus Torvalds * - * 10Sep2002 akpm@zip.com.au + * 10Sep2002 Andrew Morton * Initial version. */ @@ -18,6 +18,7 @@ #include <linux/task_io_accounting_ops.h> #include <linux/buffer_head.h> /* grr. try_to_release_page, do_invalidatepage */ +#include "internal.h" /** @@ -103,6 +104,7 @@ truncate_complete_page(struct address_space *mapping, struct page *page) cancel_dirty_page(page, PAGE_CACHE_SIZE); + clear_page_mlock(page); remove_from_page_cache(page); ClearPageMappedToDisk(page); page_cache_release(page); /* pagecache ref */ @@ -127,6 +129,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (PagePrivate(page) && !try_to_release_page(page, 0)) return 0; + clear_page_mlock(page); ret = remove_mapping(mapping, page); return ret; @@ -352,6 +355,7 @@ invalidate_complete_page2(struct address_space *mapping, struct page *page) if (PageDirty(page)) goto failed; + clear_page_mlock(page); BUG_ON(PagePrivate(page)); __remove_from_page_cache(page); spin_unlock_irq(&mapping->tree_lock); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index f018d7e..0797589 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -8,6 +8,7 @@ * Numa awareness, Christoph Lameter, SGI, June 2005 */ +#include <linux/vmalloc.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/highmem.h> @@ -18,16 +19,17 @@ #include <linux/debugobjects.h> #include <linux/vmalloc.h> #include <linux/kallsyms.h> +#include <linux/list.h> +#include <linux/rbtree.h> +#include <linux/radix-tree.h> +#include <linux/rcupdate.h> +#include <asm/atomic.h> #include <asm/uaccess.h> #include <asm/tlbflush.h> -DEFINE_RWLOCK(vmlist_lock); -struct vm_struct *vmlist; - -static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, - int node, void *caller); +/*** Page table manipulation functions ***/ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) { @@ -40,8 +42,7 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end) } while (pte++, addr += PAGE_SIZE, addr != end); } -static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end) +static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end) { pmd_t *pmd; unsigned long next; @@ -55,8 +56,7 @@ static inline void vunmap_pmd_range(pud_t *pud, unsigned long addr, } while (pmd++, addr = next, addr != end); } -static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end) +static void vunmap_pud_range(pgd_t *pgd, unsigned long addr, unsigned long end) { pud_t *pud; unsigned long next; @@ -70,12 +70,10 @@ static inline void vunmap_pud_range(pgd_t *pgd, unsigned long addr, } while (pud++, addr = next, addr != end); } -void unmap_kernel_range(unsigned long addr, unsigned long size) +static void vunmap_page_range(unsigned long addr, unsigned long end) { pgd_t *pgd; unsigned long next; - unsigned long start = addr; - unsigned long end = addr + size; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); @@ -86,35 +84,36 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) continue; vunmap_pud_range(pgd, addr, next); } while (pgd++, addr = next, addr != end); - flush_tlb_kernel_range(start, end); -} - -static void unmap_vm_area(struct vm_struct *area) -{ - unmap_kernel_range((unsigned long)area->addr, area->size); } static int vmap_pte_range(pmd_t *pmd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pte_t *pte; + /* + * nr is a running index into the array which helps higher level + * callers keep track of where we're up to. + */ + pte = pte_alloc_kernel(pmd, addr); if (!pte) return -ENOMEM; do { - struct page *page = **pages; - WARN_ON(!pte_none(*pte)); - if (!page) + struct page *page = pages[*nr]; + + if (WARN_ON(!pte_none(*pte))) + return -EBUSY; + if (WARN_ON(!page)) return -ENOMEM; set_pte_at(&init_mm, addr, pte, mk_pte(page, prot)); - (*pages)++; + (*nr)++; } while (pte++, addr += PAGE_SIZE, addr != end); return 0; } -static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) +static int vmap_pmd_range(pud_t *pud, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pmd_t *pmd; unsigned long next; @@ -124,14 +123,14 @@ static inline int vmap_pmd_range(pud_t *pud, unsigned long addr, return -ENOMEM; do { next = pmd_addr_end(addr, end); - if (vmap_pte_range(pmd, addr, next, prot, pages)) + if (vmap_pte_range(pmd, addr, next, prot, pages, nr)) return -ENOMEM; } while (pmd++, addr = next, addr != end); return 0; } -static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, - unsigned long end, pgprot_t prot, struct page ***pages) +static int vmap_pud_range(pgd_t *pgd, unsigned long addr, + unsigned long end, pgprot_t prot, struct page **pages, int *nr) { pud_t *pud; unsigned long next; @@ -141,32 +140,40 @@ static inline int vmap_pud_range(pgd_t *pgd, unsigned long addr, return -ENOMEM; do { next = pud_addr_end(addr, end); - if (vmap_pmd_range(pud, addr, next, prot, pages)) + if (vmap_pmd_range(pud, addr, next, prot, pages, nr)) return -ENOMEM; } while (pud++, addr = next, addr != end); return 0; } -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +/* + * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and + * will have pfns corresponding to the "pages" array. + * + * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N] + */ +static int vmap_page_range(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages) { pgd_t *pgd; unsigned long next; - unsigned long addr = (unsigned long) area->addr; - unsigned long end = addr + area->size - PAGE_SIZE; - int err; + int err = 0; + int nr = 0; BUG_ON(addr >= end); pgd = pgd_offset_k(addr); do { next = pgd_addr_end(addr, end); - err = vmap_pud_range(pgd, addr, next, prot, pages); + err = vmap_pud_range(pgd, addr, next, prot, pages, &nr); if (err) break; } while (pgd++, addr = next, addr != end); - flush_cache_vmap((unsigned long) area->addr, end); - return err; + flush_cache_vmap(addr, end); + + if (unlikely(err)) + return err; + return nr; } -EXPORT_SYMBOL_GPL(map_vm_area); static inline int is_vmalloc_or_module_addr(const void *x) { @@ -184,16 +191,13 @@ static inline int is_vmalloc_or_module_addr(const void *x) } /* - * Map a vmalloc()-space virtual address to the physical page. + * Walk a vmap address to the struct page it maps. */ struct page *vmalloc_to_page(const void *vmalloc_addr) { unsigned long addr = (unsigned long) vmalloc_addr; struct page *page = NULL; pgd_t *pgd = pgd_offset_k(addr); - pud_t *pud; - pmd_t *pmd; - pte_t *ptep, pte; /* * XXX we might need to change this if we add VIRTUAL_BUG_ON for @@ -202,10 +206,12 @@ struct page *vmalloc_to_page(const void *vmalloc_addr) VIRTUAL_BUG_ON(!is_vmalloc_or_module_addr(vmalloc_addr)); if (!pgd_none(*pgd)) { - pud = pud_offset(pgd, addr); + pud_t *pud = pud_offset(pgd, addr); if (!pud_none(*pud)) { - pmd = pmd_offset(pud, addr); + pmd_t *pmd = pmd_offset(pud, addr); if (!pmd_none(*pmd)) { + pte_t *ptep, pte; + ptep = pte_offset_map(pmd, addr); pte = *ptep; if (pte_present(pte)) @@ -227,13 +233,751 @@ unsigned long vmalloc_to_pfn(const void *vmalloc_addr) } EXPORT_SYMBOL(vmalloc_to_pfn); -static struct vm_struct * -__get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, - unsigned long end, int node, gfp_t gfp_mask, void *caller) + +/*** Global kva allocator ***/ + +#define VM_LAZY_FREE 0x01 +#define VM_LAZY_FREEING 0x02 +#define VM_VM_AREA 0x04 + +struct vmap_area { + unsigned long va_start; + unsigned long va_end; + unsigned long flags; + struct rb_node rb_node; /* address sorted rbtree */ + struct list_head list; /* address sorted list */ + struct list_head purge_list; /* "lazy purge" list */ + void *private; + struct rcu_head rcu_head; +}; + +static DEFINE_SPINLOCK(vmap_area_lock); +static struct rb_root vmap_area_root = RB_ROOT; +static LIST_HEAD(vmap_area_list); + +static struct vmap_area *__find_vmap_area(unsigned long addr) { - struct vm_struct **p, *tmp, *area; - unsigned long align = 1; + struct rb_node *n = vmap_area_root.rb_node; + + while (n) { + struct vmap_area *va; + + va = rb_entry(n, struct vmap_area, rb_node); + if (addr < va->va_start) + n = n->rb_left; + else if (addr > va->va_start) + n = n->rb_right; + else + return va; + } + + return NULL; +} + +static void __insert_vmap_area(struct vmap_area *va) +{ + struct rb_node **p = &vmap_area_root.rb_node; + struct rb_node *parent = NULL; + struct rb_node *tmp; + + while (*p) { + struct vmap_area *tmp; + + parent = *p; + tmp = rb_entry(parent, struct vmap_area, rb_node); + if (va->va_start < tmp->va_end) + p = &(*p)->rb_left; + else if (va->va_end > tmp->va_start) + p = &(*p)->rb_right; + else + BUG(); + } + + rb_link_node(&va->rb_node, parent, p); + rb_insert_color(&va->rb_node, &vmap_area_root); + + /* address-sort this list so it is usable like the vmlist */ + tmp = rb_prev(&va->rb_node); + if (tmp) { + struct vmap_area *prev; + prev = rb_entry(tmp, struct vmap_area, rb_node); + list_add_rcu(&va->list, &prev->list); + } else + list_add_rcu(&va->list, &vmap_area_list); +} + +static void purge_vmap_area_lazy(void); + +/* + * Allocate a region of KVA of the specified size and alignment, within the + * vstart and vend. + */ +static struct vmap_area *alloc_vmap_area(unsigned long size, + unsigned long align, + unsigned long vstart, unsigned long vend, + int node, gfp_t gfp_mask) +{ + struct vmap_area *va; + struct rb_node *n; + unsigned long addr; + int purged = 0; + + BUG_ON(size & ~PAGE_MASK); + + addr = ALIGN(vstart, align); + + va = kmalloc_node(sizeof(struct vmap_area), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!va)) + return ERR_PTR(-ENOMEM); + +retry: + spin_lock(&vmap_area_lock); + /* XXX: could have a last_hole cache */ + n = vmap_area_root.rb_node; + if (n) { + struct vmap_area *first = NULL; + + do { + struct vmap_area *tmp; + tmp = rb_entry(n, struct vmap_area, rb_node); + if (tmp->va_end >= addr) { + if (!first && tmp->va_start < addr + size) + first = tmp; + n = n->rb_left; + } else { + first = tmp; + n = n->rb_right; + } + } while (n); + + if (!first) + goto found; + + if (first->va_end < addr) { + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + + while (addr + size >= first->va_start && addr + size <= vend) { + addr = ALIGN(first->va_end + PAGE_SIZE, align); + + n = rb_next(&first->rb_node); + if (n) + first = rb_entry(n, struct vmap_area, rb_node); + else + goto found; + } + } +found: + if (addr + size > vend) { + spin_unlock(&vmap_area_lock); + if (!purged) { + purge_vmap_area_lazy(); + purged = 1; + goto retry; + } + if (printk_ratelimit()) + printk(KERN_WARNING "vmap allocation failed: " + "use vmalloc=<size> to increase size.\n"); + return ERR_PTR(-EBUSY); + } + + BUG_ON(addr & (align-1)); + + va->va_start = addr; + va->va_end = addr + size; + va->flags = 0; + __insert_vmap_area(va); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void rcu_free_va(struct rcu_head *head) +{ + struct vmap_area *va = container_of(head, struct vmap_area, rcu_head); + + kfree(va); +} + +static void __free_vmap_area(struct vmap_area *va) +{ + BUG_ON(RB_EMPTY_NODE(&va->rb_node)); + rb_erase(&va->rb_node, &vmap_area_root); + RB_CLEAR_NODE(&va->rb_node); + list_del_rcu(&va->list); + + call_rcu(&va->rcu_head, rcu_free_va); +} + +/* + * Free a region of KVA allocated by alloc_vmap_area + */ +static void free_vmap_area(struct vmap_area *va) +{ + spin_lock(&vmap_area_lock); + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); +} + +/* + * Clear the pagetable entries of a given vmap_area + */ +static void unmap_vmap_area(struct vmap_area *va) +{ + vunmap_page_range(va->va_start, va->va_end); +} + +/* + * lazy_max_pages is the maximum amount of virtual address space we gather up + * before attempting to purge with a TLB flush. + * + * There is a tradeoff here: a larger number will cover more kernel page tables + * and take slightly longer to purge, but it will linearly reduce the number of + * global TLB flushes that must be performed. It would seem natural to scale + * this number up linearly with the number of CPUs (because vmapping activity + * could also scale linearly with the number of CPUs), however it is likely + * that in practice, workloads might be constrained in other ways that mean + * vmap activity will not scale linearly with CPUs. Also, I want to be + * conservative and not introduce a big latency on huge systems, so go with + * a less aggressive log scale. It will still be an improvement over the old + * code, and it will be simple to change the scale factor if we find that it + * becomes a problem on bigger systems. + */ +static unsigned long lazy_max_pages(void) +{ + unsigned int log; + + log = fls(num_online_cpus()); + + return log * (32UL * 1024 * 1024 / PAGE_SIZE); +} + +static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); + +/* + * Purges all lazily-freed vmap areas. + * + * If sync is 0 then don't purge if there is already a purge in progress. + * If force_flush is 1, then flush kernel TLBs between *start and *end even + * if we found no lazy vmap areas to unmap (callers can use this to optimise + * their own TLB flushing). + * Returns with *start = min(*start, lowest purged address) + * *end = max(*end, highest purged address) + */ +static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, + int sync, int force_flush) +{ + static DEFINE_SPINLOCK(purge_lock); + LIST_HEAD(valist); + struct vmap_area *va; + int nr = 0; + + /* + * If sync is 0 but force_flush is 1, we'll go sync anyway but callers + * should not expect such behaviour. This just simplifies locking for + * the case that isn't actually used at the moment anyway. + */ + if (!sync && !force_flush) { + if (!spin_trylock(&purge_lock)) + return; + } else + spin_lock(&purge_lock); + + rcu_read_lock(); + list_for_each_entry_rcu(va, &vmap_area_list, list) { + if (va->flags & VM_LAZY_FREE) { + if (va->va_start < *start) + *start = va->va_start; + if (va->va_end > *end) + *end = va->va_end; + nr += (va->va_end - va->va_start) >> PAGE_SHIFT; + unmap_vmap_area(va); + list_add_tail(&va->purge_list, &valist); + va->flags |= VM_LAZY_FREEING; + va->flags &= ~VM_LAZY_FREE; + } + } + rcu_read_unlock(); + + if (nr) { + BUG_ON(nr > atomic_read(&vmap_lazy_nr)); + atomic_sub(nr, &vmap_lazy_nr); + } + + if (nr || force_flush) + flush_tlb_kernel_range(*start, *end); + + if (nr) { + spin_lock(&vmap_area_lock); + list_for_each_entry(va, &valist, purge_list) + __free_vmap_area(va); + spin_unlock(&vmap_area_lock); + } + spin_unlock(&purge_lock); +} + +/* + * Kick off a purge of the outstanding lazy areas. + */ +static void purge_vmap_area_lazy(void) +{ + unsigned long start = ULONG_MAX, end = 0; + + __purge_vmap_area_lazy(&start, &end, 0, 0); +} + +/* + * Free and unmap a vmap area + */ +static void free_unmap_vmap_area(struct vmap_area *va) +{ + va->flags |= VM_LAZY_FREE; + atomic_add((va->va_end - va->va_start) >> PAGE_SHIFT, &vmap_lazy_nr); + if (unlikely(atomic_read(&vmap_lazy_nr) > lazy_max_pages())) + purge_vmap_area_lazy(); +} + +static struct vmap_area *find_vmap_area(unsigned long addr) +{ + struct vmap_area *va; + + spin_lock(&vmap_area_lock); + va = __find_vmap_area(addr); + spin_unlock(&vmap_area_lock); + + return va; +} + +static void free_unmap_vmap_area_addr(unsigned long addr) +{ + struct vmap_area *va; + + va = find_vmap_area(addr); + BUG_ON(!va); + free_unmap_vmap_area(va); +} + + +/*** Per cpu kva allocator ***/ + +/* + * vmap space is limited especially on 32 bit architectures. Ensure there is + * room for at least 16 percpu vmap blocks per CPU. + */ +/* + * If we had a constant VMALLOC_START and VMALLOC_END, we'd like to be able + * to #define VMALLOC_SPACE (VMALLOC_END-VMALLOC_START). Guess + * instead (we just need a rough idea) + */ +#if BITS_PER_LONG == 32 +#define VMALLOC_SPACE (128UL*1024*1024) +#else +#define VMALLOC_SPACE (128UL*1024*1024*1024) +#endif + +#define VMALLOC_PAGES (VMALLOC_SPACE / PAGE_SIZE) +#define VMAP_MAX_ALLOC BITS_PER_LONG /* 256K with 4K pages */ +#define VMAP_BBMAP_BITS_MAX 1024 /* 4MB with 4K pages */ +#define VMAP_BBMAP_BITS_MIN (VMAP_MAX_ALLOC*2) +#define VMAP_MIN(x, y) ((x) < (y) ? (x) : (y)) /* can't use min() */ +#define VMAP_MAX(x, y) ((x) > (y) ? (x) : (y)) /* can't use max() */ +#define VMAP_BBMAP_BITS VMAP_MIN(VMAP_BBMAP_BITS_MAX, \ + VMAP_MAX(VMAP_BBMAP_BITS_MIN, \ + VMALLOC_PAGES / NR_CPUS / 16)) + +#define VMAP_BLOCK_SIZE (VMAP_BBMAP_BITS * PAGE_SIZE) + +struct vmap_block_queue { + spinlock_t lock; + struct list_head free; + struct list_head dirty; + unsigned int nr_dirty; +}; + +struct vmap_block { + spinlock_t lock; + struct vmap_area *va; + struct vmap_block_queue *vbq; + unsigned long free, dirty; + DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); + DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); + union { + struct { + struct list_head free_list; + struct list_head dirty_list; + }; + struct rcu_head rcu_head; + }; +}; + +/* Queue of free and dirty vmap blocks, for allocation and flushing purposes */ +static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue); + +/* + * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block + * in the free path. Could get rid of this if we change the API to return a + * "cookie" from alloc, to be passed to free. But no big deal yet. + */ +static DEFINE_SPINLOCK(vmap_block_tree_lock); +static RADIX_TREE(vmap_block_tree, GFP_ATOMIC); + +/* + * We should probably have a fallback mechanism to allocate virtual memory + * out of partially filled vmap blocks. However vmap block sizing should be + * fairly reasonable according to the vmalloc size, so it shouldn't be a + * big problem. + */ + +static unsigned long addr_to_vb_idx(unsigned long addr) +{ + addr -= VMALLOC_START & ~(VMAP_BLOCK_SIZE-1); + addr /= VMAP_BLOCK_SIZE; + return addr; +} + +static struct vmap_block *new_vmap_block(gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + struct vmap_area *va; + unsigned long vb_idx; + int node, err; + + node = numa_node_id(); + + vb = kmalloc_node(sizeof(struct vmap_block), + gfp_mask & GFP_RECLAIM_MASK, node); + if (unlikely(!vb)) + return ERR_PTR(-ENOMEM); + + va = alloc_vmap_area(VMAP_BLOCK_SIZE, VMAP_BLOCK_SIZE, + VMALLOC_START, VMALLOC_END, + node, gfp_mask); + if (unlikely(IS_ERR(va))) { + kfree(vb); + return ERR_PTR(PTR_ERR(va)); + } + + err = radix_tree_preload(gfp_mask); + if (unlikely(err)) { + kfree(vb); + free_vmap_area(va); + return ERR_PTR(err); + } + + spin_lock_init(&vb->lock); + vb->va = va; + vb->free = VMAP_BBMAP_BITS; + vb->dirty = 0; + bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); + bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); + INIT_LIST_HEAD(&vb->free_list); + INIT_LIST_HEAD(&vb->dirty_list); + + vb_idx = addr_to_vb_idx(va->va_start); + spin_lock(&vmap_block_tree_lock); + err = radix_tree_insert(&vmap_block_tree, vb_idx, vb); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(err); + radix_tree_preload_end(); + + vbq = &get_cpu_var(vmap_block_queue); + vb->vbq = vbq; + spin_lock(&vbq->lock); + list_add(&vb->free_list, &vbq->free); + spin_unlock(&vbq->lock); + put_cpu_var(vmap_cpu_blocks); + + return vb; +} + +static void rcu_free_vb(struct rcu_head *head) +{ + struct vmap_block *vb = container_of(head, struct vmap_block, rcu_head); + + kfree(vb); +} + +static void free_vmap_block(struct vmap_block *vb) +{ + struct vmap_block *tmp; + unsigned long vb_idx; + + spin_lock(&vb->vbq->lock); + if (!list_empty(&vb->free_list)) + list_del(&vb->free_list); + if (!list_empty(&vb->dirty_list)) + list_del(&vb->dirty_list); + spin_unlock(&vb->vbq->lock); + + vb_idx = addr_to_vb_idx(vb->va->va_start); + spin_lock(&vmap_block_tree_lock); + tmp = radix_tree_delete(&vmap_block_tree, vb_idx); + spin_unlock(&vmap_block_tree_lock); + BUG_ON(tmp != vb); + + free_unmap_vmap_area(vb->va); + call_rcu(&vb->rcu_head, rcu_free_vb); +} + +static void *vb_alloc(unsigned long size, gfp_t gfp_mask) +{ + struct vmap_block_queue *vbq; + struct vmap_block *vb; + unsigned long addr = 0; + unsigned int order; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + +again: + rcu_read_lock(); + vbq = &get_cpu_var(vmap_block_queue); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = bitmap_find_free_region(vb->alloc_map, + VMAP_BBMAP_BITS, order); + + if (i >= 0) { + addr = vb->va->va_start + (i << PAGE_SHIFT); + BUG_ON(addr_to_vb_idx(addr) != + addr_to_vb_idx(vb->va->va_start)); + vb->free -= 1UL << order; + if (vb->free == 0) { + spin_lock(&vbq->lock); + list_del_init(&vb->free_list); + spin_unlock(&vbq->lock); + } + spin_unlock(&vb->lock); + break; + } + spin_unlock(&vb->lock); + } + put_cpu_var(vmap_cpu_blocks); + rcu_read_unlock(); + + if (!addr) { + vb = new_vmap_block(gfp_mask); + if (IS_ERR(vb)) + return vb; + goto again; + } + + return (void *)addr; +} + +static void vb_free(const void *addr, unsigned long size) +{ + unsigned long offset; + unsigned long vb_idx; + unsigned int order; + struct vmap_block *vb; + + BUG_ON(size & ~PAGE_MASK); + BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); + order = get_order(size); + + offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1); + + vb_idx = addr_to_vb_idx((unsigned long)addr); + rcu_read_lock(); + vb = radix_tree_lookup(&vmap_block_tree, vb_idx); + rcu_read_unlock(); + BUG_ON(!vb); + + spin_lock(&vb->lock); + bitmap_allocate_region(vb->dirty_map, offset >> PAGE_SHIFT, order); + if (!vb->dirty) { + spin_lock(&vb->vbq->lock); + list_add(&vb->dirty_list, &vb->vbq->dirty); + spin_unlock(&vb->vbq->lock); + } + vb->dirty += 1UL << order; + if (vb->dirty == VMAP_BBMAP_BITS) { + BUG_ON(vb->free || !list_empty(&vb->free_list)); + spin_unlock(&vb->lock); + free_vmap_block(vb); + } else + spin_unlock(&vb->lock); +} + +/** + * vm_unmap_aliases - unmap outstanding lazy aliases in the vmap layer + * + * The vmap/vmalloc layer lazily flushes kernel virtual mappings primarily + * to amortize TLB flushing overheads. What this means is that any page you + * have now, may, in a former life, have been mapped into kernel virtual + * address by the vmap layer and so there might be some CPUs with TLB entries + * still referencing that page (additional to the regular 1:1 kernel mapping). + * + * vm_unmap_aliases flushes all such lazy mappings. After it returns, we can + * be sure that none of the pages we have control over will have any aliases + * from the vmap layer. + */ +void vm_unmap_aliases(void) +{ + unsigned long start = ULONG_MAX, end = 0; + int cpu; + int flush = 0; + + for_each_possible_cpu(cpu) { + struct vmap_block_queue *vbq = &per_cpu(vmap_block_queue, cpu); + struct vmap_block *vb; + + rcu_read_lock(); + list_for_each_entry_rcu(vb, &vbq->free, free_list) { + int i; + + spin_lock(&vb->lock); + i = find_first_bit(vb->dirty_map, VMAP_BBMAP_BITS); + while (i < VMAP_BBMAP_BITS) { + unsigned long s, e; + int j; + j = find_next_zero_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + + s = vb->va->va_start + (i << PAGE_SHIFT); + e = vb->va->va_start + (j << PAGE_SHIFT); + vunmap_page_range(s, e); + flush = 1; + + if (s < start) + start = s; + if (e > end) + end = e; + + i = j; + i = find_next_bit(vb->dirty_map, + VMAP_BBMAP_BITS, i); + } + spin_unlock(&vb->lock); + } + rcu_read_unlock(); + } + + __purge_vmap_area_lazy(&start, &end, 1, flush); +} +EXPORT_SYMBOL_GPL(vm_unmap_aliases); + +/** + * vm_unmap_ram - unmap linear kernel address space set up by vm_map_ram + * @mem: the pointer returned by vm_map_ram + * @count: the count passed to that vm_map_ram call (cannot unmap partial) + */ +void vm_unmap_ram(const void *mem, unsigned int count) +{ + unsigned long size = count << PAGE_SHIFT; + unsigned long addr = (unsigned long)mem; + + BUG_ON(!addr); + BUG_ON(addr < VMALLOC_START); + BUG_ON(addr > VMALLOC_END); + BUG_ON(addr & (PAGE_SIZE-1)); + + debug_check_no_locks_freed(mem, size); + + if (likely(count <= VMAP_MAX_ALLOC)) + vb_free(mem, size); + else + free_unmap_vmap_area_addr(addr); +} +EXPORT_SYMBOL(vm_unmap_ram); + +/** + * vm_map_ram - map pages linearly into kernel virtual address (vmalloc space) + * @pages: an array of pointers to the pages to be mapped + * @count: number of pages + * @node: prefer to allocate data structures on this node + * @prot: memory protection to use. PAGE_KERNEL for regular RAM + * @returns: a pointer to the address that has been mapped, or NULL on failure + */ +void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot) +{ + unsigned long size = count << PAGE_SHIFT; unsigned long addr; + void *mem; + + if (likely(count <= VMAP_MAX_ALLOC)) { + mem = vb_alloc(size, GFP_KERNEL); + if (IS_ERR(mem)) + return NULL; + addr = (unsigned long)mem; + } else { + struct vmap_area *va; + va = alloc_vmap_area(size, PAGE_SIZE, + VMALLOC_START, VMALLOC_END, node, GFP_KERNEL); + if (IS_ERR(va)) + return NULL; + + addr = va->va_start; + mem = (void *)addr; + } + if (vmap_page_range(addr, addr + size, prot, pages) < 0) { + vm_unmap_ram(mem, count); + return NULL; + } + return mem; +} +EXPORT_SYMBOL(vm_map_ram); + +void __init vmalloc_init(void) +{ + int i; + + for_each_possible_cpu(i) { + struct vmap_block_queue *vbq; + + vbq = &per_cpu(vmap_block_queue, i); + spin_lock_init(&vbq->lock); + INIT_LIST_HEAD(&vbq->free); + INIT_LIST_HEAD(&vbq->dirty); + vbq->nr_dirty = 0; + } +} + +void unmap_kernel_range(unsigned long addr, unsigned long size) +{ + unsigned long end = addr + size; + vunmap_page_range(addr, end); + flush_tlb_kernel_range(addr, end); +} + +int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page ***pages) +{ + unsigned long addr = (unsigned long)area->addr; + unsigned long end = addr + area->size - PAGE_SIZE; + int err; + + err = vmap_page_range(addr, end, prot, *pages); + if (err > 0) { + *pages += err; + err = 0; + } + + return err; +} +EXPORT_SYMBOL_GPL(map_vm_area); + +/*** Old vmalloc interfaces ***/ +DEFINE_RWLOCK(vmlist_lock); +struct vm_struct *vmlist; + +static struct vm_struct *__get_vm_area_node(unsigned long size, + unsigned long flags, unsigned long start, unsigned long end, + int node, gfp_t gfp_mask, void *caller) +{ + static struct vmap_area *va; + struct vm_struct *area; + struct vm_struct *tmp, **p; + unsigned long align = 1; BUG_ON(in_interrupt()); if (flags & VM_IOREMAP) { @@ -246,13 +990,12 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, align = 1ul << bit; } - addr = ALIGN(start, align); + size = PAGE_ALIGN(size); if (unlikely(!size)) return NULL; area = kmalloc_node(sizeof(*area), gfp_mask & GFP_RECLAIM_MASK, node); - if (unlikely(!area)) return NULL; @@ -261,48 +1004,32 @@ __get_vm_area_node(unsigned long size, unsigned long flags, unsigned long start, */ size += PAGE_SIZE; - write_lock(&vmlist_lock); - for (p = &vmlist; (tmp = *p) != NULL ;p = &tmp->next) { - if ((unsigned long)tmp->addr < addr) { - if((unsigned long)tmp->addr + tmp->size >= addr) - addr = ALIGN(tmp->size + - (unsigned long)tmp->addr, align); - continue; - } - if ((size + addr) < addr) - goto out; - if (size + addr <= (unsigned long)tmp->addr) - goto found; - addr = ALIGN(tmp->size + (unsigned long)tmp->addr, align); - if (addr > end - size) - goto out; + va = alloc_vmap_area(size, align, start, end, node, gfp_mask); + if (IS_ERR(va)) { + kfree(area); + return NULL; } - if ((size + addr) < addr) - goto out; - if (addr > end - size) - goto out; - -found: - area->next = *p; - *p = area; area->flags = flags; - area->addr = (void *)addr; + area->addr = (void *)va->va_start; area->size = size; area->pages = NULL; area->nr_pages = 0; area->phys_addr = 0; area->caller = caller; + va->private = area; + va->flags |= VM_VM_AREA; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != NULL; p = &tmp->next) { + if (tmp->addr >= area->addr) + break; + } + area->next = *p; + *p = area; write_unlock(&vmlist_lock); return area; - -out: - write_unlock(&vmlist_lock); - kfree(area); - if (printk_ratelimit()) - printk(KERN_WARNING "allocation failed: out of vmalloc space - use vmalloc=<size> to increase size.\n"); - return NULL; } struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags, @@ -342,39 +1069,15 @@ struct vm_struct *get_vm_area_node(unsigned long size, unsigned long flags, gfp_mask, __builtin_return_address(0)); } -/* Caller must hold vmlist_lock */ -static struct vm_struct *__find_vm_area(const void *addr) +static struct vm_struct *find_vm_area(const void *addr) { - struct vm_struct *tmp; + struct vmap_area *va; - for (tmp = vmlist; tmp != NULL; tmp = tmp->next) { - if (tmp->addr == addr) - break; - } - - return tmp; -} - -/* Caller must hold vmlist_lock */ -static struct vm_struct *__remove_vm_area(const void *addr) -{ - struct vm_struct **p, *tmp; + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) + return va->private; - for (p = &vmlist ; (tmp = *p) != NULL ;p = &tmp->next) { - if (tmp->addr == addr) - goto found; - } return NULL; - -found: - unmap_vm_area(tmp); - *p = tmp->next; - - /* - * Remove the guard page. - */ - tmp->size -= PAGE_SIZE; - return tmp; } /** @@ -387,11 +1090,24 @@ found: */ struct vm_struct *remove_vm_area(const void *addr) { - struct vm_struct *v; - write_lock(&vmlist_lock); - v = __remove_vm_area(addr); - write_unlock(&vmlist_lock); - return v; + struct vmap_area *va; + + va = find_vmap_area((unsigned long)addr); + if (va && va->flags & VM_VM_AREA) { + struct vm_struct *vm = va->private; + struct vm_struct *tmp, **p; + free_unmap_vmap_area(va); + vm->size -= PAGE_SIZE; + + write_lock(&vmlist_lock); + for (p = &vmlist; (tmp = *p) != vm; p = &tmp->next) + ; + *p = tmp->next; + write_unlock(&vmlist_lock); + + return vm; + } + return NULL; } static void __vunmap(const void *addr, int deallocate_pages) @@ -501,6 +1217,8 @@ void *vmap(struct page **pages, unsigned int count, } EXPORT_SYMBOL(vmap); +static void *__vmalloc_node(unsigned long size, gfp_t gfp_mask, pgprot_t prot, + int node, void *caller); static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, pgprot_t prot, int node, void *caller) { @@ -627,10 +1345,8 @@ void *vmalloc_user(unsigned long size) ret = __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO, PAGE_KERNEL); if (ret) { - write_lock(&vmlist_lock); - area = __find_vm_area(ret); + area = find_vm_area(ret); area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); } return ret; } @@ -710,10 +1426,8 @@ void *vmalloc_32_user(unsigned long size) ret = __vmalloc(size, GFP_VMALLOC32 | __GFP_ZERO, PAGE_KERNEL); if (ret) { - write_lock(&vmlist_lock); - area = __find_vm_area(ret); + area = find_vm_area(ret); area->flags |= VM_USERMAP; - write_unlock(&vmlist_lock); } return ret; } @@ -814,26 +1528,25 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, struct vm_struct *area; unsigned long uaddr = vma->vm_start; unsigned long usize = vma->vm_end - vma->vm_start; - int ret; if ((PAGE_SIZE-1) & (unsigned long)addr) return -EINVAL; - read_lock(&vmlist_lock); - area = __find_vm_area(addr); + area = find_vm_area(addr); if (!area) - goto out_einval_locked; + return -EINVAL; if (!(area->flags & VM_USERMAP)) - goto out_einval_locked; + return -EINVAL; if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) - goto out_einval_locked; - read_unlock(&vmlist_lock); + return -EINVAL; addr += pgoff << PAGE_SHIFT; do { struct page *page = vmalloc_to_page(addr); + int ret; + ret = vm_insert_page(vma, uaddr, page); if (ret) return ret; @@ -846,11 +1559,7 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, /* Prevent "things" like memory migration? VM_flags need a cleanup... */ vma->vm_flags |= VM_RESERVED; - return ret; - -out_einval_locked: - read_unlock(&vmlist_lock); - return -EINVAL; + return 0; } EXPORT_SYMBOL(remap_vmalloc_range); diff --git a/mm/vmscan.c b/mm/vmscan.c index 1ff1a58..3b58602 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -39,6 +39,7 @@ #include <linux/freezer.h> #include <linux/memcontrol.h> #include <linux/delayacct.h> +#include <linux/sysctl.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -78,7 +79,7 @@ struct scan_control { unsigned long (*isolate_pages)(unsigned long nr, struct list_head *dst, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active); + int active, int file); }; #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) @@ -470,6 +471,85 @@ int remove_mapping(struct address_space *mapping, struct page *page) return 0; } +/** + * putback_lru_page - put previously isolated page onto appropriate LRU list + * @page: page to be put back to appropriate lru list + * + * Add previously isolated @page to appropriate LRU list. + * Page may still be unevictable for other reasons. + * + * lru_lock must not be held, interrupts must be enabled. + */ +#ifdef CONFIG_UNEVICTABLE_LRU +void putback_lru_page(struct page *page) +{ + int lru; + int active = !!TestClearPageActive(page); + int was_unevictable = PageUnevictable(page); + + VM_BUG_ON(PageLRU(page)); + +redo: + ClearPageUnevictable(page); + + if (page_evictable(page, NULL)) { + /* + * For evictable pages, we can use the cache. + * In event of a race, worst case is we end up with an + * unevictable page on [in]active list. + * We know how to handle that. + */ + lru = active + page_is_file_cache(page); + lru_cache_add_lru(page, lru); + } else { + /* + * Put unevictable pages directly on zone's unevictable + * list. + */ + lru = LRU_UNEVICTABLE; + add_page_to_unevictable_list(page); + } + mem_cgroup_move_lists(page, lru); + + /* + * page's status can change while we move it among lru. If an evictable + * page is on unevictable list, it never be freed. To avoid that, + * check after we added it to the list, again. + */ + if (lru == LRU_UNEVICTABLE && page_evictable(page, NULL)) { + if (!isolate_lru_page(page)) { + put_page(page); + goto redo; + } + /* This means someone else dropped this page from LRU + * So, it will be freed or putback to LRU again. There is + * nothing to do here. + */ + } + + if (was_unevictable && lru != LRU_UNEVICTABLE) + count_vm_event(UNEVICTABLE_PGRESCUED); + else if (!was_unevictable && lru == LRU_UNEVICTABLE) + count_vm_event(UNEVICTABLE_PGCULLED); + + put_page(page); /* drop ref from isolate */ +} + +#else /* CONFIG_UNEVICTABLE_LRU */ + +void putback_lru_page(struct page *page) +{ + int lru; + VM_BUG_ON(PageLRU(page)); + + lru = !!TestClearPageActive(page) + page_is_file_cache(page); + lru_cache_add_lru(page, lru); + mem_cgroup_move_lists(page, lru); + put_page(page); +} +#endif /* CONFIG_UNEVICTABLE_LRU */ + + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -503,6 +583,9 @@ static unsigned long shrink_page_list(struct list_head *page_list, sc->nr_scanned++; + if (unlikely(!page_evictable(page, NULL))) + goto cull_mlocked; + if (!sc->may_swap && page_mapped(page)) goto keep_locked; @@ -539,9 +622,19 @@ static unsigned long shrink_page_list(struct list_head *page_list, * Anonymous process memory has backing store? * Try to allocate it some swap space here. */ - if (PageAnon(page) && !PageSwapCache(page)) + if (PageAnon(page) && !PageSwapCache(page)) { + switch (try_to_munlock(page)) { + case SWAP_FAIL: /* shouldn't happen */ + case SWAP_AGAIN: + goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; + case SWAP_SUCCESS: + ; /* fall thru'; add to swap cache */ + } if (!add_to_swap(page, GFP_ATOMIC)) goto activate_locked; + } #endif /* CONFIG_SWAP */ mapping = page_mapping(page); @@ -556,6 +649,8 @@ static unsigned long shrink_page_list(struct list_head *page_list, goto activate_locked; case SWAP_AGAIN: goto keep_locked; + case SWAP_MLOCK: + goto cull_mlocked; case SWAP_SUCCESS: ; /* try to free the page below */ } @@ -602,7 +697,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, * possible for a page to have PageDirty set, but it is actually * clean (all its buffers are clean). This happens if the * buffers were written out directly, with submit_bh(). ext3 - * will do this, as well as the blockdev mapping. + * will do this, as well as the blockdev mapping. * try_to_release_page() will discover that cleanness and will * drop the buffers and mark the page clean - it can be freed. * @@ -637,7 +732,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!mapping || !__remove_mapping(mapping, page)) goto keep_locked; - unlock_page(page); + /* + * At this point, we have no other references and there is + * no way to pick any more up (removed from LRU, removed + * from pagecache). Can use non-atomic bitops now (and + * we obviously don't have to worry about waking up a process + * waiting on the page lock, because there are no references. + */ + __clear_page_locked(page); free_it: nr_reclaimed++; if (!pagevec_add(&freed_pvec, page)) { @@ -646,14 +748,23 @@ free_it: } continue; +cull_mlocked: + unlock_page(page); + putback_lru_page(page); + continue; + activate_locked: + /* Not a candidate for swapping, so reclaim swap space. */ + if (PageSwapCache(page) && vm_swap_full()) + remove_exclusive_swap_page_ref(page); + VM_BUG_ON(PageActive(page)); SetPageActive(page); pgactivate++; keep_locked: unlock_page(page); keep: list_add(&page->lru, &ret_pages); - VM_BUG_ON(PageLRU(page)); + VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } list_splice(&ret_pages, page_list); if (pagevec_count(&freed_pvec)) @@ -677,7 +788,7 @@ keep: * * returns 0 on success, -ve errno on failure. */ -int __isolate_lru_page(struct page *page, int mode) +int __isolate_lru_page(struct page *page, int mode, int file) { int ret = -EINVAL; @@ -693,6 +804,17 @@ int __isolate_lru_page(struct page *page, int mode) if (mode != ISOLATE_BOTH && (!PageActive(page) != !mode)) return ret; + if (mode != ISOLATE_BOTH && (!page_is_file_cache(page) != !file)) + return ret; + + /* + * When this function is being called for lumpy reclaim, we + * initially look into all LRU pages, active, inactive and + * unevictable; only give shrink_page_list evictable pages. + */ + if (PageUnevictable(page)) + return ret; + ret = -EBUSY; if (likely(get_page_unless_zero(page))) { /* @@ -723,12 +845,13 @@ int __isolate_lru_page(struct page *page, int mode) * @scanned: The number of pages that were scanned. * @order: The caller's attempted allocation order * @mode: One of the LRU isolation modes + * @file: True [1] if isolating file [!anon] pages * * returns how many pages were moved onto *@dst. */ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, struct list_head *src, struct list_head *dst, - unsigned long *scanned, int order, int mode) + unsigned long *scanned, int order, int mode, int file) { unsigned long nr_taken = 0; unsigned long scan; @@ -745,7 +868,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, VM_BUG_ON(!PageLRU(page)); - switch (__isolate_lru_page(page, mode)) { + switch (__isolate_lru_page(page, mode, file)) { case 0: list_move(&page->lru, dst); nr_taken++; @@ -788,10 +911,11 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, break; cursor_page = pfn_to_page(pfn); + /* Check that we have not crossed a zone boundary. */ if (unlikely(page_zone_id(cursor_page) != zone_id)) continue; - switch (__isolate_lru_page(cursor_page, mode)) { + switch (__isolate_lru_page(cursor_page, mode, file)) { case 0: list_move(&cursor_page->lru, dst); nr_taken++; @@ -802,7 +926,7 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan, /* else it is being freed elsewhere */ list_move(&cursor_page->lru, src); default: - break; + break; /* ! on LRU or wrong list */ } } } @@ -816,40 +940,93 @@ static unsigned long isolate_pages_global(unsigned long nr, unsigned long *scanned, int order, int mode, struct zone *z, struct mem_cgroup *mem_cont, - int active) + int active, int file) { + int lru = LRU_BASE; if (active) - return isolate_lru_pages(nr, &z->active_list, dst, - scanned, order, mode); - else - return isolate_lru_pages(nr, &z->inactive_list, dst, - scanned, order, mode); + lru += LRU_ACTIVE; + if (file) + lru += LRU_FILE; + return isolate_lru_pages(nr, &z->lru[lru].list, dst, scanned, order, + mode, !!file); } /* * clear_active_flags() is a helper for shrink_active_list(), clearing * any active bits from the pages in the list. */ -static unsigned long clear_active_flags(struct list_head *page_list) +static unsigned long clear_active_flags(struct list_head *page_list, + unsigned int *count) { int nr_active = 0; + int lru; struct page *page; - list_for_each_entry(page, page_list, lru) + list_for_each_entry(page, page_list, lru) { + lru = page_is_file_cache(page); if (PageActive(page)) { + lru += LRU_ACTIVE; ClearPageActive(page); nr_active++; } + count[lru]++; + } return nr_active; } +/** + * isolate_lru_page - tries to isolate a page from its LRU list + * @page: page to isolate from its LRU list + * + * Isolates a @page from an LRU list, clears PageLRU and adjusts the + * vmstat statistic corresponding to whatever LRU list the page was on. + * + * Returns 0 if the page was removed from an LRU list. + * Returns -EBUSY if the page was not on an LRU list. + * + * The returned page will have PageLRU() cleared. If it was found on + * the active list, it will have PageActive set. If it was found on + * the unevictable list, it will have the PageUnevictable bit set. That flag + * may need to be cleared by the caller before letting the page go. + * + * The vmstat statistic corresponding to the list on which the page was + * found will be decremented. + * + * Restrictions: + * (1) Must be called with an elevated refcount on the page. This is a + * fundamentnal difference from isolate_lru_pages (which is called + * without a stable reference). + * (2) the lru_lock must not be held. + * (3) interrupts must be enabled. + */ +int isolate_lru_page(struct page *page) +{ + int ret = -EBUSY; + + if (PageLRU(page)) { + struct zone *zone = page_zone(page); + + spin_lock_irq(&zone->lru_lock); + if (PageLRU(page) && get_page_unless_zero(page)) { + int lru = page_lru(page); + ret = 0; + ClearPageLRU(page); + + del_page_from_lru_list(zone, page, lru); + } + spin_unlock_irq(&zone->lru_lock); + } + return ret; +} + /* * shrink_inactive_list() is a helper for shrink_zone(). It returns the number * of reclaimed pages */ static unsigned long shrink_inactive_list(unsigned long max_scan, - struct zone *zone, struct scan_control *sc) + struct zone *zone, struct scan_control *sc, + int priority, int file) { LIST_HEAD(page_list); struct pagevec pvec; @@ -866,20 +1043,43 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, unsigned long nr_scan; unsigned long nr_freed; unsigned long nr_active; + unsigned int count[NR_LRU_LISTS] = { 0, }; + int mode = ISOLATE_INACTIVE; + + /* + * If we need a large contiguous chunk of memory, or have + * trouble getting a small set of contiguous pages, we + * will reclaim both active and inactive pages. + * + * We use the same threshold as pageout congestion_wait below. + */ + if (sc->order > PAGE_ALLOC_COSTLY_ORDER) + mode = ISOLATE_BOTH; + else if (sc->order && priority < DEF_PRIORITY - 2) + mode = ISOLATE_BOTH; nr_taken = sc->isolate_pages(sc->swap_cluster_max, - &page_list, &nr_scan, sc->order, - (sc->order > PAGE_ALLOC_COSTLY_ORDER)? - ISOLATE_BOTH : ISOLATE_INACTIVE, - zone, sc->mem_cgroup, 0); - nr_active = clear_active_flags(&page_list); + &page_list, &nr_scan, sc->order, mode, + zone, sc->mem_cgroup, 0, file); + nr_active = clear_active_flags(&page_list, count); __count_vm_events(PGDEACTIVATE, nr_active); - __mod_zone_page_state(zone, NR_ACTIVE, -nr_active); - __mod_zone_page_state(zone, NR_INACTIVE, - -(nr_taken - nr_active)); - if (scan_global_lru(sc)) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, + -count[LRU_ACTIVE_FILE]); + __mod_zone_page_state(zone, NR_INACTIVE_FILE, + -count[LRU_INACTIVE_FILE]); + __mod_zone_page_state(zone, NR_ACTIVE_ANON, + -count[LRU_ACTIVE_ANON]); + __mod_zone_page_state(zone, NR_INACTIVE_ANON, + -count[LRU_INACTIVE_ANON]); + + if (scan_global_lru(sc)) { zone->pages_scanned += nr_scan; + zone->recent_scanned[0] += count[LRU_INACTIVE_ANON]; + zone->recent_scanned[0] += count[LRU_ACTIVE_ANON]; + zone->recent_scanned[1] += count[LRU_INACTIVE_FILE]; + zone->recent_scanned[1] += count[LRU_ACTIVE_FILE]; + } spin_unlock_irq(&zone->lru_lock); nr_scanned += nr_scan; @@ -899,7 +1099,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * The attempt at page out may have made some * of the pages active, mark them inactive again. */ - nr_active = clear_active_flags(&page_list); + nr_active = clear_active_flags(&page_list, count); count_vm_events(PGDEACTIVATE, nr_active); nr_freed += shrink_page_list(&page_list, sc, @@ -924,14 +1124,24 @@ static unsigned long shrink_inactive_list(unsigned long max_scan, * Put back any unfreeable pages. */ while (!list_empty(&page_list)) { + int lru; page = lru_to_page(&page_list); VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); list_del(&page->lru); - if (PageActive(page)) - add_page_to_active_list(zone, page); - else - add_page_to_inactive_list(zone, page); + if (unlikely(!page_evictable(page, NULL))) { + spin_unlock_irq(&zone->lru_lock); + putback_lru_page(page); + spin_lock_irq(&zone->lru_lock); + continue; + } + SetPageLRU(page); + lru = page_lru(page); + add_page_to_lru_list(zone, page, lru); + mem_cgroup_move_lists(page, lru); + if (PageActive(page) && scan_global_lru(sc)) { + int file = !!page_is_file_cache(page); + zone->recent_rotated[file]++; + } if (!pagevec_add(&pvec, page)) { spin_unlock_irq(&zone->lru_lock); __pagevec_release(&pvec); @@ -962,115 +1172,7 @@ static inline void note_zone_scanning_priority(struct zone *zone, int priority) static inline int zone_is_near_oom(struct zone *zone) { - return zone->pages_scanned >= (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE))*3; -} - -/* - * Determine we should try to reclaim mapped pages. - * This is called only when sc->mem_cgroup is NULL. - */ -static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, - int priority) -{ - long mapped_ratio; - long distress; - long swap_tendency; - long imbalance; - int reclaim_mapped = 0; - int prev_priority; - - if (scan_global_lru(sc) && zone_is_near_oom(zone)) - return 1; - /* - * `distress' is a measure of how much trouble we're having - * reclaiming pages. 0 -> no problems. 100 -> great trouble. - */ - if (scan_global_lru(sc)) - prev_priority = zone->prev_priority; - else - prev_priority = mem_cgroup_get_reclaim_priority(sc->mem_cgroup); - - distress = 100 >> min(prev_priority, priority); - - /* - * The point of this algorithm is to decide when to start - * reclaiming mapped memory instead of just pagecache. Work out - * how much memory - * is mapped. - */ - if (scan_global_lru(sc)) - mapped_ratio = ((global_page_state(NR_FILE_MAPPED) + - global_page_state(NR_ANON_PAGES)) * 100) / - vm_total_pages; - else - mapped_ratio = mem_cgroup_calc_mapped_ratio(sc->mem_cgroup); - - /* - * Now decide how much we really want to unmap some pages. The - * mapped ratio is downgraded - just because there's a lot of - * mapped memory doesn't necessarily mean that page reclaim - * isn't succeeding. - * - * The distress ratio is important - we don't want to start - * going oom. - * - * A 100% value of vm_swappiness overrides this algorithm - * altogether. - */ - swap_tendency = mapped_ratio / 2 + distress + sc->swappiness; - - /* - * If there's huge imbalance between active and inactive - * (think active 100 times larger than inactive) we should - * become more permissive, or the system will take too much - * cpu before it start swapping during memory pressure. - * Distress is about avoiding early-oom, this is about - * making swappiness graceful despite setting it to low - * values. - * - * Avoid div by zero with nr_inactive+1, and max resulting - * value is vm_total_pages. - */ - if (scan_global_lru(sc)) { - imbalance = zone_page_state(zone, NR_ACTIVE); - imbalance /= zone_page_state(zone, NR_INACTIVE) + 1; - } else - imbalance = mem_cgroup_reclaim_imbalance(sc->mem_cgroup); - - /* - * Reduce the effect of imbalance if swappiness is low, - * this means for a swappiness very low, the imbalance - * must be much higher than 100 for this logic to make - * the difference. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= (vm_swappiness + 1); - imbalance /= 100; - - /* - * If not much of the ram is mapped, makes the imbalance - * less relevant, it's high priority we refill the inactive - * list with mapped pages only in presence of high ratio of - * mapped pages. - * - * Max temporary value is vm_total_pages*100. - */ - imbalance *= mapped_ratio; - imbalance /= 100; - - /* apply imbalance feedback to swap_tendency */ - swap_tendency += imbalance; - - /* - * Now use this metric to decide whether to start moving mapped - * memory onto the inactive list. - */ - if (swap_tendency >= 100) - reclaim_mapped = 1; - - return reclaim_mapped; + return zone->pages_scanned >= (zone_lru_pages(zone) * 3); } /* @@ -1093,53 +1195,71 @@ static int calc_reclaim_mapped(struct scan_control *sc, struct zone *zone, static void shrink_active_list(unsigned long nr_pages, struct zone *zone, - struct scan_control *sc, int priority) + struct scan_control *sc, int priority, int file) { unsigned long pgmoved; int pgdeactivate = 0; unsigned long pgscanned; LIST_HEAD(l_hold); /* The pages which were snipped off */ - LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ - LIST_HEAD(l_active); /* Pages to go onto the active_list */ + LIST_HEAD(l_inactive); struct page *page; struct pagevec pvec; - int reclaim_mapped = 0; - - if (sc->may_swap) - reclaim_mapped = calc_reclaim_mapped(sc, zone, priority); + enum lru_list lru; lru_add_drain(); spin_lock_irq(&zone->lru_lock); pgmoved = sc->isolate_pages(nr_pages, &l_hold, &pgscanned, sc->order, ISOLATE_ACTIVE, zone, - sc->mem_cgroup, 1); + sc->mem_cgroup, 1, file); /* * zone->pages_scanned is used for detect zone's oom * mem_cgroup remembers nr_scan by itself. */ - if (scan_global_lru(sc)) + if (scan_global_lru(sc)) { zone->pages_scanned += pgscanned; + zone->recent_scanned[!!file] += pgmoved; + } - __mod_zone_page_state(zone, NR_ACTIVE, -pgmoved); + if (file) + __mod_zone_page_state(zone, NR_ACTIVE_FILE, -pgmoved); + else + __mod_zone_page_state(zone, NR_ACTIVE_ANON, -pgmoved); spin_unlock_irq(&zone->lru_lock); + pgmoved = 0; while (!list_empty(&l_hold)) { cond_resched(); page = lru_to_page(&l_hold); list_del(&page->lru); - if (page_mapped(page)) { - if (!reclaim_mapped || - (total_swap_pages == 0 && PageAnon(page)) || - page_referenced(page, 0, sc->mem_cgroup)) { - list_add(&page->lru, &l_active); - continue; - } + + if (unlikely(!page_evictable(page, NULL))) { + putback_lru_page(page); + continue; } + + /* page_referenced clears PageReferenced */ + if (page_mapping_inuse(page) && + page_referenced(page, 0, sc->mem_cgroup)) + pgmoved++; + list_add(&page->lru, &l_inactive); } + /* + * Count referenced pages from currently used mappings as + * rotated, even though they are moved to the inactive list. + * This helps balance scan pressure between file and anonymous + * pages in get_scan_ratio. + */ + zone->recent_rotated[!!file] += pgmoved; + + /* + * Move the pages to the [file or anon] inactive list. + */ pagevec_init(&pvec, 1); + pgmoved = 0; + lru = LRU_BASE + file * LRU_FILE; spin_lock_irq(&zone->lru_lock); while (!list_empty(&l_inactive)) { page = lru_to_page(&l_inactive); @@ -1149,11 +1269,11 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, VM_BUG_ON(!PageActive(page)); ClearPageActive(page); - list_move(&page->lru, &zone->inactive_list); - mem_cgroup_move_lists(page, false); + list_move(&page->lru, &zone->lru[lru].list); + mem_cgroup_move_lists(page, lru); pgmoved++; if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); spin_unlock_irq(&zone->lru_lock); pgdeactivate += pgmoved; pgmoved = 0; @@ -1163,104 +1283,189 @@ static void shrink_active_list(unsigned long nr_pages, struct zone *zone, spin_lock_irq(&zone->lru_lock); } } - __mod_zone_page_state(zone, NR_INACTIVE, pgmoved); + __mod_zone_page_state(zone, NR_LRU_BASE + lru, pgmoved); pgdeactivate += pgmoved; if (buffer_heads_over_limit) { spin_unlock_irq(&zone->lru_lock); pagevec_strip(&pvec); spin_lock_irq(&zone->lru_lock); } - - pgmoved = 0; - while (!list_empty(&l_active)) { - page = lru_to_page(&l_active); - prefetchw_prev_lru_page(page, &l_active, flags); - VM_BUG_ON(PageLRU(page)); - SetPageLRU(page); - VM_BUG_ON(!PageActive(page)); - - list_move(&page->lru, &zone->active_list); - mem_cgroup_move_lists(page, true); - pgmoved++; - if (!pagevec_add(&pvec, page)) { - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); - pgmoved = 0; - spin_unlock_irq(&zone->lru_lock); - __pagevec_release(&pvec); - spin_lock_irq(&zone->lru_lock); - } - } - __mod_zone_page_state(zone, NR_ACTIVE, pgmoved); - __count_zone_vm_events(PGREFILL, zone, pgscanned); __count_vm_events(PGDEACTIVATE, pgdeactivate); spin_unlock_irq(&zone->lru_lock); + if (vm_swap_full()) + pagevec_swap_free(&pvec); pagevec_release(&pvec); } +static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, + struct zone *zone, struct scan_control *sc, int priority) +{ + int file = is_file_lru(lru); + + if (lru == LRU_ACTIVE_FILE) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + + if (lru == LRU_ACTIVE_ANON && + (!scan_global_lru(sc) || inactive_anon_is_low(zone))) { + shrink_active_list(nr_to_scan, zone, sc, priority, file); + return 0; + } + return shrink_inactive_list(nr_to_scan, zone, sc, priority, file); +} + +/* + * Determine how aggressively the anon and file LRU lists should be + * scanned. The relative value of each set of LRU lists is determined + * by looking at the fraction of the pages scanned we did rotate back + * onto the active list instead of evict. + * + * percent[0] specifies how much pressure to put on ram/swap backed + * memory, while percent[1] determines pressure on the file LRUs. + */ +static void get_scan_ratio(struct zone *zone, struct scan_control *sc, + unsigned long *percent) +{ + unsigned long anon, file, free; + unsigned long anon_prio, file_prio; + unsigned long ap, fp; + + anon = zone_page_state(zone, NR_ACTIVE_ANON) + + zone_page_state(zone, NR_INACTIVE_ANON); + file = zone_page_state(zone, NR_ACTIVE_FILE) + + zone_page_state(zone, NR_INACTIVE_FILE); + free = zone_page_state(zone, NR_FREE_PAGES); + + /* If we have no swap space, do not bother scanning anon pages. */ + if (nr_swap_pages <= 0) { + percent[0] = 0; + percent[1] = 100; + return; + } + + /* If we have very few page cache pages, force-scan anon pages. */ + if (unlikely(file + free <= zone->pages_high)) { + percent[0] = 100; + percent[1] = 0; + return; + } + + /* + * OK, so we have swap space and a fair amount of page cache + * pages. We use the recently rotated / recently scanned + * ratios to determine how valuable each cache is. + * + * Because workloads change over time (and to avoid overflow) + * we keep these statistics as a floating average, which ends + * up weighing recent references more than old ones. + * + * anon in [0], file in [1] + */ + if (unlikely(zone->recent_scanned[0] > anon / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[0] /= 2; + zone->recent_rotated[0] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + if (unlikely(zone->recent_scanned[1] > file / 4)) { + spin_lock_irq(&zone->lru_lock); + zone->recent_scanned[1] /= 2; + zone->recent_rotated[1] /= 2; + spin_unlock_irq(&zone->lru_lock); + } + + /* + * With swappiness at 100, anonymous and file have the same priority. + * This scanning priority is essentially the inverse of IO cost. + */ + anon_prio = sc->swappiness; + file_prio = 200 - sc->swappiness; + + /* + * anon recent_rotated[0] + * %anon = 100 * ----------- / ----------------- * IO cost + * anon + file rotate_sum + */ + ap = (anon_prio + 1) * (zone->recent_scanned[0] + 1); + ap /= zone->recent_rotated[0] + 1; + + fp = (file_prio + 1) * (zone->recent_scanned[1] + 1); + fp /= zone->recent_rotated[1] + 1; + + /* Normalize to percentages */ + percent[0] = 100 * ap / (ap + fp + 1); + percent[1] = 100 - percent[0]; +} + + /* * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. */ static unsigned long shrink_zone(int priority, struct zone *zone, struct scan_control *sc) { - unsigned long nr_active; - unsigned long nr_inactive; + unsigned long nr[NR_LRU_LISTS]; unsigned long nr_to_scan; unsigned long nr_reclaimed = 0; + unsigned long percent[2]; /* anon @ 0; file @ 1 */ + enum lru_list l; - if (scan_global_lru(sc)) { - /* - * Add one to nr_to_scan just to make sure that the kernel - * will slowly sift through the active list. - */ - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> priority) + 1; - nr_active = zone->nr_scan_active; - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> priority) + 1; - nr_inactive = zone->nr_scan_inactive; - if (nr_inactive >= sc->swap_cluster_max) - zone->nr_scan_inactive = 0; - else - nr_inactive = 0; - - if (nr_active >= sc->swap_cluster_max) - zone->nr_scan_active = 0; - else - nr_active = 0; - } else { - /* - * This reclaim occurs not because zone memory shortage but - * because memory controller hits its limit. - * Then, don't modify zone reclaim related data. - */ - nr_active = mem_cgroup_calc_reclaim_active(sc->mem_cgroup, - zone, priority); - - nr_inactive = mem_cgroup_calc_reclaim_inactive(sc->mem_cgroup, - zone, priority); - } + get_scan_ratio(zone, sc, percent); + for_each_evictable_lru(l) { + if (scan_global_lru(sc)) { + int file = is_file_lru(l); + int scan; - while (nr_active || nr_inactive) { - if (nr_active) { - nr_to_scan = min(nr_active, - (unsigned long)sc->swap_cluster_max); - nr_active -= nr_to_scan; - shrink_active_list(nr_to_scan, zone, sc, priority); + scan = zone_page_state(zone, NR_LRU_BASE + l); + if (priority) { + scan >>= priority; + scan = (scan * percent[file]) / 100; + } + zone->lru[l].nr_scan += scan; + nr[l] = zone->lru[l].nr_scan; + if (nr[l] >= sc->swap_cluster_max) + zone->lru[l].nr_scan = 0; + else + nr[l] = 0; + } else { + /* + * This reclaim occurs not because zone memory shortage + * but because memory controller hits its limit. + * Don't modify zone reclaim related data. + */ + nr[l] = mem_cgroup_calc_reclaim(sc->mem_cgroup, zone, + priority, l); } + } - if (nr_inactive) { - nr_to_scan = min(nr_inactive, + while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || + nr[LRU_INACTIVE_FILE]) { + for_each_evictable_lru(l) { + if (nr[l]) { + nr_to_scan = min(nr[l], (unsigned long)sc->swap_cluster_max); - nr_inactive -= nr_to_scan; - nr_reclaimed += shrink_inactive_list(nr_to_scan, zone, - sc); + nr[l] -= nr_to_scan; + + nr_reclaimed += shrink_list(l, nr_to_scan, + zone, sc, priority); + } } } + /* + * Even if we did not try to evict anon pages at all, we want to + * rebalance the anon lru active/inactive ratio. + */ + if (!scan_global_lru(sc) || inactive_anon_is_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + else if (!scan_global_lru(sc)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, sc, priority, 0); + throttle_vm_writeout(sc->gfp_mask); return nr_reclaimed; } @@ -1321,7 +1526,7 @@ static unsigned long shrink_zones(int priority, struct zonelist *zonelist, return nr_reclaimed; } - + /* * This is the main entry point to direct page reclaim. * @@ -1364,8 +1569,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL)) continue; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } } @@ -1555,6 +1759,14 @@ loop_again: priority != DEF_PRIORITY) continue; + /* + * Do some background aging of the anon list, to give + * pages a chance to be referenced before reclaiming. + */ + if (inactive_anon_is_low(zone)) + shrink_active_list(SWAP_CLUSTER_MAX, zone, + &sc, priority, 0); + if (!zone_watermark_ok(zone, order, zone->pages_high, 0, 0)) { end_zone = i; @@ -1567,8 +1779,7 @@ loop_again: for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - lru_pages += zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE); + lru_pages += zone_lru_pages(zone); } /* @@ -1612,8 +1823,7 @@ loop_again: if (zone_is_all_unreclaimable(zone)) continue; if (nr_slab == 0 && zone->pages_scanned >= - (zone_page_state(zone, NR_ACTIVE) - + zone_page_state(zone, NR_INACTIVE)) * 6) + (zone_lru_pages(zone) * 6)) zone_set_flag(zone, ZONE_ALL_UNRECLAIMABLE); /* @@ -1667,7 +1877,7 @@ out: /* * The background pageout daemon, started as a kernel thread - * from the init process. + * from the init process. * * This basically trickles out pages so that we have _some_ * free memory available even if there is no other activity @@ -1761,6 +1971,14 @@ void wakeup_kswapd(struct zone *zone, int order) wake_up_interruptible(&pgdat->kswapd_wait); } +unsigned long global_lru_pages(void) +{ + return global_page_state(NR_ACTIVE_ANON) + + global_page_state(NR_ACTIVE_FILE) + + global_page_state(NR_INACTIVE_ANON) + + global_page_state(NR_INACTIVE_FILE); +} + #ifdef CONFIG_PM /* * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages @@ -1774,6 +1992,7 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, { struct zone *zone; unsigned long nr_to_scan, ret = 0; + enum lru_list l; for_each_zone(zone) { @@ -1783,38 +2002,31 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio, if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY) continue; - /* For pass = 0 we don't shrink the active list */ - if (pass > 0) { - zone->nr_scan_active += - (zone_page_state(zone, NR_ACTIVE) >> prio) + 1; - if (zone->nr_scan_active >= nr_pages || pass > 3) { - zone->nr_scan_active = 0; + for_each_evictable_lru(l) { + /* For pass = 0, we don't shrink the active list */ + if (pass == 0 && + (l == LRU_ACTIVE || l == LRU_ACTIVE_FILE)) + continue; + + zone->lru[l].nr_scan += + (zone_page_state(zone, NR_LRU_BASE + l) + >> prio) + 1; + if (zone->lru[l].nr_scan >= nr_pages || pass > 3) { + zone->lru[l].nr_scan = 0; nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_ACTIVE)); - shrink_active_list(nr_to_scan, zone, sc, prio); + zone_page_state(zone, + NR_LRU_BASE + l)); + ret += shrink_list(l, nr_to_scan, zone, + sc, prio); + if (ret >= nr_pages) + return ret; } } - - zone->nr_scan_inactive += - (zone_page_state(zone, NR_INACTIVE) >> prio) + 1; - if (zone->nr_scan_inactive >= nr_pages || pass > 3) { - zone->nr_scan_inactive = 0; - nr_to_scan = min(nr_pages, - zone_page_state(zone, NR_INACTIVE)); - ret += shrink_inactive_list(nr_to_scan, zone, sc); - if (ret >= nr_pages) - return ret; - } } return ret; } -static unsigned long count_lru_pages(void) -{ - return global_page_state(NR_ACTIVE) + global_page_state(NR_INACTIVE); -} - /* * Try to free `nr_pages' of memory, system-wide, and return the number of * freed pages. @@ -1840,7 +2052,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) current->reclaim_state = &reclaim_state; - lru_pages = count_lru_pages(); + lru_pages = global_lru_pages(); nr_slab = global_page_state(NR_SLAB_RECLAIMABLE); /* If slab caches are huge, it's better to hit them first */ while (nr_slab >= lru_pages) { @@ -1883,7 +2095,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) reclaim_state.reclaimed_slab = 0; shrink_slab(sc.nr_scanned, sc.gfp_mask, - count_lru_pages()); + global_lru_pages()); ret += reclaim_state.reclaimed_slab; if (ret >= nr_pages) goto out; @@ -1900,7 +2112,7 @@ unsigned long shrink_all_memory(unsigned long nr_pages) if (!ret) { do { reclaim_state.reclaimed_slab = 0; - shrink_slab(nr_pages, sc.gfp_mask, count_lru_pages()); + shrink_slab(nr_pages, sc.gfp_mask, global_lru_pages()); ret += reclaim_state.reclaimed_slab; } while (ret < nr_pages && reclaim_state.reclaimed_slab > 0); } @@ -2128,3 +2340,285 @@ int zone_reclaim(struct zone *zone, gfp_t gfp_mask, unsigned int order) return ret; } #endif + +#ifdef CONFIG_UNEVICTABLE_LRU +/* + * page_evictable - test whether a page is evictable + * @page: the page to test + * @vma: the VMA in which the page is or will be mapped, may be NULL + * + * Test whether page is evictable--i.e., should be placed on active/inactive + * lists vs unevictable list. The vma argument is !NULL when called from the + * fault path to determine how to instantate a new page. + * + * Reasons page might not be evictable: + * (1) page's mapping marked unevictable + * (2) page is part of an mlocked VMA + * + */ +int page_evictable(struct page *page, struct vm_area_struct *vma) +{ + + if (mapping_unevictable(page_mapping(page))) + return 0; + + if (PageMlocked(page) || (vma && is_mlocked_vma(vma, page))) + return 0; + + return 1; +} + +static void show_page_path(struct page *page) +{ + char buf[256]; + if (page_is_file_cache(page)) { + struct address_space *mapping = page->mapping; + struct dentry *dentry; + pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT); + + spin_lock(&mapping->i_mmap_lock); + dentry = d_find_alias(mapping->host); + printk(KERN_INFO "rescued: %s %lu\n", + dentry_path(dentry, buf, 256), pgoff); + spin_unlock(&mapping->i_mmap_lock); + } else { +#if defined(CONFIG_MM_OWNER) && defined(CONFIG_MMU) + struct anon_vma *anon_vma; + struct vm_area_struct *vma; + + anon_vma = page_lock_anon_vma(page); + if (!anon_vma) + return; + + list_for_each_entry(vma, &anon_vma->head, anon_vma_node) { + printk(KERN_INFO "rescued: anon %s\n", + vma->vm_mm->owner->comm); + break; + } + page_unlock_anon_vma(anon_vma); +#endif + } +} + + +/** + * check_move_unevictable_page - check page for evictability and move to appropriate zone lru list + * @page: page to check evictability and move to appropriate lru list + * @zone: zone page is in + * + * Checks a page for evictability and moves the page to the appropriate + * zone lru list. + * + * Restrictions: zone->lru_lock must be held, page must be on LRU and must + * have PageUnevictable set. + */ +static void check_move_unevictable_page(struct page *page, struct zone *zone) +{ + VM_BUG_ON(PageActive(page)); + +retry: + ClearPageUnevictable(page); + if (page_evictable(page, NULL)) { + enum lru_list l = LRU_INACTIVE_ANON + page_is_file_cache(page); + + show_page_path(page); + + __dec_zone_state(zone, NR_UNEVICTABLE); + list_move(&page->lru, &zone->lru[l].list); + __inc_zone_state(zone, NR_INACTIVE_ANON + l); + __count_vm_event(UNEVICTABLE_PGRESCUED); + } else { + /* + * rotate unevictable list + */ + SetPageUnevictable(page); + list_move(&page->lru, &zone->lru[LRU_UNEVICTABLE].list); + if (page_evictable(page, NULL)) + goto retry; + } +} + +/** + * scan_mapping_unevictable_pages - scan an address space for evictable pages + * @mapping: struct address_space to scan for evictable pages + * + * Scan all pages in mapping. Check unevictable pages for + * evictability and move them to the appropriate zone lru list. + */ +void scan_mapping_unevictable_pages(struct address_space *mapping) +{ + pgoff_t next = 0; + pgoff_t end = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + struct zone *zone; + struct pagevec pvec; + + if (mapping->nrpages == 0) + return; + + pagevec_init(&pvec, 0); + while (next < end && + pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) { + int i; + int pg_scanned = 0; + + zone = NULL; + + for (i = 0; i < pagevec_count(&pvec); i++) { + struct page *page = pvec.pages[i]; + pgoff_t page_index = page->index; + struct zone *pagezone = page_zone(page); + + pg_scanned++; + if (page_index > next) + next = page_index; + next++; + + if (pagezone != zone) { + if (zone) + spin_unlock_irq(&zone->lru_lock); + zone = pagezone; + spin_lock_irq(&zone->lru_lock); + } + + if (PageLRU(page) && PageUnevictable(page)) + check_move_unevictable_page(page, zone); + } + if (zone) + spin_unlock_irq(&zone->lru_lock); + pagevec_release(&pvec); + + count_vm_events(UNEVICTABLE_PGSCANNED, pg_scanned); + } + +} + +/** + * scan_zone_unevictable_pages - check unevictable list for evictable pages + * @zone - zone of which to scan the unevictable list + * + * Scan @zone's unevictable LRU lists to check for pages that have become + * evictable. Move those that have to @zone's inactive list where they + * become candidates for reclaim, unless shrink_inactive_zone() decides + * to reactivate them. Pages that are still unevictable are rotated + * back onto @zone's unevictable list. + */ +#define SCAN_UNEVICTABLE_BATCH_SIZE 16UL /* arbitrary lock hold batch size */ +void scan_zone_unevictable_pages(struct zone *zone) +{ + struct list_head *l_unevictable = &zone->lru[LRU_UNEVICTABLE].list; + unsigned long scan; + unsigned long nr_to_scan = zone_page_state(zone, NR_UNEVICTABLE); + + while (nr_to_scan > 0) { + unsigned long batch_size = min(nr_to_scan, + SCAN_UNEVICTABLE_BATCH_SIZE); + + spin_lock_irq(&zone->lru_lock); + for (scan = 0; scan < batch_size; scan++) { + struct page *page = lru_to_page(l_unevictable); + + if (!trylock_page(page)) + continue; + + prefetchw_prev_lru_page(page, l_unevictable, flags); + + if (likely(PageLRU(page) && PageUnevictable(page))) + check_move_unevictable_page(page, zone); + + unlock_page(page); + } + spin_unlock_irq(&zone->lru_lock); + + nr_to_scan -= batch_size; + } +} + + +/** + * scan_all_zones_unevictable_pages - scan all unevictable lists for evictable pages + * + * A really big hammer: scan all zones' unevictable LRU lists to check for + * pages that have become evictable. Move those back to the zones' + * inactive list where they become candidates for reclaim. + * This occurs when, e.g., we have unswappable pages on the unevictable lists, + * and we add swap to the system. As such, it runs in the context of a task + * that has possibly/probably made some previously unevictable pages + * evictable. + */ +void scan_all_zones_unevictable_pages(void) +{ + struct zone *zone; + + for_each_zone(zone) { + scan_zone_unevictable_pages(zone); + } +} + +/* + * scan_unevictable_pages [vm] sysctl handler. On demand re-scan of + * all nodes' unevictable lists for evictable pages + */ +unsigned long scan_unevictable_pages; + +int scan_unevictable_handler(struct ctl_table *table, int write, + struct file *file, void __user *buffer, + size_t *length, loff_t *ppos) +{ + proc_doulongvec_minmax(table, write, file, buffer, length, ppos); + + if (write && *(unsigned long *)table->data) + scan_all_zones_unevictable_pages(); + + scan_unevictable_pages = 0; + return 0; +} + +/* + * per node 'scan_unevictable_pages' attribute. On demand re-scan of + * a specified node's per zone unevictable lists for evictable pages. + */ + +static ssize_t read_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + char *buf) +{ + return sprintf(buf, "0\n"); /* always zero; should fit... */ +} + +static ssize_t write_scan_unevictable_node(struct sys_device *dev, + struct sysdev_attribute *attr, + const char *buf, size_t count) +{ + struct zone *node_zones = NODE_DATA(dev->id)->node_zones; + struct zone *zone; + unsigned long res; + unsigned long req = strict_strtoul(buf, 10, &res); + + if (!req) + return 1; /* zero is no-op */ + + for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) { + if (!populated_zone(zone)) + continue; + scan_zone_unevictable_pages(zone); + } + return 1; +} + + +static SYSDEV_ATTR(scan_unevictable_pages, S_IRUGO | S_IWUSR, + read_scan_unevictable_node, + write_scan_unevictable_node); + +int scan_unevictable_register_node(struct node *node) +{ + return sysdev_create_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +void scan_unevictable_unregister_node(struct node *node) +{ + sysdev_remove_file(&node->sysdev, &attr_scan_unevictable_pages); +} + +#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index d7826af..9343227 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -619,8 +619,14 @@ const struct seq_operations pagetypeinfo_op = { static const char * const vmstat_text[] = { /* Zoned VM counters */ "nr_free_pages", - "nr_inactive", - "nr_active", + "nr_inactive_anon", + "nr_active_anon", + "nr_inactive_file", + "nr_active_file", +#ifdef CONFIG_UNEVICTABLE_LRU + "nr_unevictable", + "nr_mlock", +#endif "nr_anon_pages", "nr_mapped", "nr_file_pages", @@ -675,6 +681,16 @@ static const char * const vmstat_text[] = { "htlb_buddy_alloc_success", "htlb_buddy_alloc_fail", #endif +#ifdef CONFIG_UNEVICTABLE_LRU + "unevictable_pgs_culled", + "unevictable_pgs_scanned", + "unevictable_pgs_rescued", + "unevictable_pgs_mlocked", + "unevictable_pgs_munlocked", + "unevictable_pgs_cleared", + "unevictable_pgs_stranded", + "unevictable_pgs_mlockfreed", +#endif #endif }; @@ -688,7 +704,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, "\n min %lu" "\n low %lu" "\n high %lu" - "\n scanned %lu (a: %lu i: %lu)" + "\n scanned %lu (aa: %lu ia: %lu af: %lu if: %lu)" "\n spanned %lu" "\n present %lu", zone_page_state(zone, NR_FREE_PAGES), @@ -696,7 +712,10 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, zone->pages_low, zone->pages_high, zone->pages_scanned, - zone->nr_scan_active, zone->nr_scan_inactive, + zone->lru[LRU_ACTIVE_ANON].nr_scan, + zone->lru[LRU_INACTIVE_ANON].nr_scan, + zone->lru[LRU_ACTIVE_FILE].nr_scan, + zone->lru[LRU_INACTIVE_FILE].nr_scan, zone->spanned_pages, zone->present_pages); @@ -733,10 +752,12 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat, seq_printf(m, "\n all_unreclaimable: %u" "\n prev_priority: %i" - "\n start_pfn: %lu", + "\n start_pfn: %lu" + "\n inactive_ratio: %u", zone_is_all_unreclaimable(zone), zone->prev_priority, - zone->zone_start_pfn); + zone->zone_start_pfn, + zone->inactive_ratio); seq_putc(m, '\n'); } |