Diffstat (limited to 'mm')

 -rw-r--r--  mm/hugetlb.c    |  2
 -rw-r--r--  mm/mempolicy.c  |  2
 -rw-r--r--  mm/mmap.c       |  6
 -rw-r--r--  mm/nommu.c      |  2
 -rw-r--r--  mm/page_alloc.c | 13
 -rw-r--r--  mm/slab.c       | 14
 -rw-r--r--  mm/slub.c       | 15
 -rw-r--r--  mm/sparse.c     | 14
 -rw-r--r--  mm/vmscan.c     | 69

 9 files changed, 113 insertions, 24 deletions
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index d7ca59d..de4cf45 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -643,7 +643,7 @@ int follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
                         spin_unlock(&mm->page_table_lock);
                         ret = hugetlb_fault(mm, vma, vaddr, 0);
                         spin_lock(&mm->page_table_lock);
-                        if (!(ret & VM_FAULT_MAJOR))
+                        if (!(ret & VM_FAULT_ERROR))
                                 continue;
 
                         remainder = 0;
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 71b84b4..172abff 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -149,7 +149,7 @@ static struct zonelist *bind_zonelist(nodemask_t *nodes)
            lower zones etc. Avoid empty zones because the memory allocator
            doesn't like them. If you implement node hot removal you
            have to fix that. */
-        k = policy_zone;
+        k = MAX_NR_ZONES - 1;
         while (1) {
                 for_each_node_mask(nd, *nodes) {
                         struct zone *z = &NODE_DATA(nd)->node_zones[k];
diff --git a/mm/mmap.c b/mm/mmap.c
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -93,7 +93,7 @@ atomic_t vm_committed_space = ATOMIC_INIT(0);
  * Note this is a helper function intended to be used by LSMs which
  * wish to use this logic.
  */
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
         unsigned long free, allowed;
 
@@ -166,7 +166,7 @@ int __vm_enough_memory(long pages, int cap_sys_admin)
         /* Don't let a single process grow too big:
            leave 3% of the size of this process for other processes */
-        allowed -= current->mm->total_vm / 32;
+        allowed -= mm->total_vm / 32;
 
         /*
          * cast `allowed' as a signed long because vm_committed_space
@@ -2077,7 +2077,7 @@ int insert_vm_struct(struct mm_struct * mm, struct vm_area_struct * vma)
         if (__vma && __vma->vm_start < vma->vm_end)
                 return -ENOMEM;
         if ((vma->vm_flags & VM_ACCOUNT) &&
-             security_vm_enough_memory(vma_pages(vma)))
+             security_vm_enough_memory_mm(mm, vma_pages(vma)))
                 return -ENOMEM;
         vma_link(mm, vma, prev, rb_link, rb_parent);
         return 0;
diff --git a/mm/nommu.c b/mm/nommu.c
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1270,7 +1270,7 @@ EXPORT_SYMBOL(get_unmapped_area);
  * Note this is a helper function intended to be used by LSMs which
  * wish to use this logic.
  */
-int __vm_enough_memory(long pages, int cap_sys_admin)
+int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
 {
         unsigned long free, allowed;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 3da85b8..6427653 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1157,6 +1157,7 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
         nodemask_t *allowednodes = NULL;/* zonelist_cache approximation */
         int zlc_active = 0;             /* set if using zonelist_cache */
         int did_zlc_setup = 0;          /* just call zlc_setup() one time */
+        enum zone_type highest_zoneidx = -1; /* Gets set for policy zonelists */
 
 zonelist_scan:
         /*
@@ -1166,6 +1167,18 @@ zonelist_scan:
         z = zonelist->zones;
 
         do {
+                /*
+                 * In NUMA, this could be a policy zonelist which contains
+                 * zones that may not be allowed by the current gfp_mask.
+                 * Check the zone is allowed by the current flags
+                 */
+                if (unlikely(alloc_should_filter_zonelist(zonelist))) {
+                        if (highest_zoneidx == -1)
+                                highest_zoneidx = gfp_zone(gfp_mask);
+                        if (zone_idx(*z) > highest_zoneidx)
+                                continue;
+                }
+
                 if (NUMA_BUILD && zlc_active &&
                         !zlc_zone_worth_trying(zonelist, z, allowednodes))
                                 continue;
diff --git a/mm/slab.c b/mm/slab.c
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -883,6 +883,7 @@ static void __slab_error(const char *function, struct kmem_cache *cachep,
  */
 static int use_alien_caches __read_mostly = 1;
+static int numa_platform __read_mostly = 1;
 static int __init noaliencache_setup(char *s)
 {
         use_alien_caches = 0;
@@ -1399,8 +1400,10 @@ void __init kmem_cache_init(void)
         int order;
         int node;
 
-        if (num_possible_nodes() == 1)
+        if (num_possible_nodes() == 1) {
                 use_alien_caches = 0;
+                numa_platform = 0;
+        }
 
         for (i = 0; i < NUM_INIT_LISTS; i++) {
                 kmem_list3_init(&initkmem_list3[i]);
@@ -3558,7 +3561,14 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp)
         check_irq_off();
         objp = cache_free_debugcheck(cachep, objp, __builtin_return_address(0));
 
-        if (cache_free_alien(cachep, objp))
+        /*
+         * Skip calling cache_free_alien() when the platform is not numa.
+         * This will avoid cache misses that happen while accessing slabp (which
+         * is per page memory reference) to get nodeid. Instead use a global
+         * variable to skip the call, which is mostly likely to be present in
+         * the cache.
+         */
+        if (numa_platform && cache_free_alien(cachep, objp))
                 return;
 
         if (likely(ac->avail < ac->limit)) {
diff --git a/mm/slub.c b/mm/slub.c
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1877,9 +1877,16 @@ static struct kmem_cache_node * __init early_kmem_cache_node_alloc(gfp_t gfpflag
 
         BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
 
-        page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
+        page = new_slab(kmalloc_caches, gfpflags, node);
 
         BUG_ON(!page);
+        if (page_to_nid(page) != node) {
+                printk(KERN_ERR "SLUB: Unable to allocate memory from "
+                                "node %d\n", node);
+                printk(KERN_ERR "SLUB: Allocating a useless per node structure "
+                                "in order to be able to continue\n");
+        }
+
         n = page->freelist;
         BUG_ON(!n);
         page->freelist = get_freepointer(kmalloc_caches, n);
@@ -3112,7 +3119,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
                 unsigned long flags;
                 struct page *page;
 
-                if (!atomic_read(&n->nr_slabs))
+                if (!atomic_long_read(&n->nr_slabs))
                         continue;
 
                 spin_lock_irqsave(&n->list_lock, flags);
@@ -3247,7 +3254,7 @@ static unsigned long slab_objects(struct kmem_cache *s,
                 }
 
                 if (flags & SO_FULL) {
-                        int full_slabs = atomic_read(&n->nr_slabs)
+                        int full_slabs = atomic_long_read(&n->nr_slabs)
                                         - per_cpu[node]
                                         - n->nr_partial;
@@ -3283,7 +3290,7 @@ static int any_slab_objects(struct kmem_cache *s)
         for_each_node(node) {
                 struct kmem_cache_node *n = get_node(s, node);
 
-                if (n->nr_partial || atomic_read(&n->nr_slabs))
+                if (n->nr_partial || atomic_long_read(&n->nr_slabs))
                         return 1;
         }
         return 0;
diff --git a/mm/sparse.c b/mm/sparse.c
index 3047bf0..239f5a7 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -41,6 +41,15 @@ int page_to_nid(struct page *page)
         return section_to_node_table[page_to_section(page)];
 }
 EXPORT_SYMBOL(page_to_nid);
+
+static void set_section_nid(unsigned long section_nr, int nid)
+{
+        section_to_node_table[section_nr] = nid;
+}
+#else /* !NODE_NOT_IN_PAGE_FLAGS */
+static inline void set_section_nid(unsigned long section_nr, int nid)
+{
+}
 #endif
 
 #ifdef CONFIG_SPARSEMEM_EXTREME
@@ -68,10 +77,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid)
         struct mem_section *section;
         int ret = 0;
 
-#ifdef NODE_NOT_IN_PAGE_FLAGS
-        section_to_node_table[section_nr] = nid;
-#endif
-
         if (mem_section[root])
                 return -EEXIST;
@@ -148,6 +153,7 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
                 struct mem_section *ms;
 
                 sparse_index_init(section, nid);
+                set_section_nid(section, nid);
 
                 ms = __nr_to_section(section);
                 if (!ms->section_mem_map)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index d419e10..a6e65d0 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -271,6 +271,12 @@ static void handle_write_error(struct address_space *mapping,
         unlock_page(page);
 }
 
+/* Request for sync pageout. */
+enum pageout_io {
+        PAGEOUT_IO_ASYNC,
+        PAGEOUT_IO_SYNC,
+};
+
 /* possible outcome of pageout() */
 typedef enum {
         /* failed to write page out, page is locked */
@@ -287,7 +293,8 @@ typedef enum {
  * pageout is called by shrink_page_list() for each dirty page.
  * Calls ->writepage().
  */
-static pageout_t pageout(struct page *page, struct address_space *mapping)
+static pageout_t pageout(struct page *page, struct address_space *mapping,
+                                                enum pageout_io sync_writeback)
 {
         /*
          * If the page is dirty, only perform writeback if that write
@@ -346,6 +353,15 @@ static pageout_t pageout(struct page *page, struct address_space *mapping)
                         ClearPageReclaim(page);
                         return PAGE_ACTIVATE;
                 }
+
+                /*
+                 * Wait on writeback if requested to. This happens when
+                 * direct reclaiming a large contiguous area and the
+                 * first attempt to free a range of pages fails.
+                 */
+                if (PageWriteback(page) && sync_writeback == PAGEOUT_IO_SYNC)
+                        wait_on_page_writeback(page);
+
                 if (!PageWriteback(page)) {
                         /* synchronous write or broken a_ops? */
                         ClearPageReclaim(page);
@@ -423,7 +439,8 @@ cannot_free:
  * shrink_page_list() returns the number of reclaimed pages
  */
 static unsigned long shrink_page_list(struct list_head *page_list,
-                                        struct scan_control *sc)
+                                        struct scan_control *sc,
+                                        enum pageout_io sync_writeback)
 {
         LIST_HEAD(ret_pages);
         struct pagevec freed_pvec;
@@ -458,8 +475,23 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                 if (page_mapped(page) || PageSwapCache(page))
                         sc->nr_scanned++;
 
-                if (PageWriteback(page))
-                        goto keep_locked;
+                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
+                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
+
+                if (PageWriteback(page)) {
+                        /*
+                         * Synchronous reclaim is performed in two passes,
+                         * first an asynchronous pass over the list to
+                         * start parallel writeback, and a second synchronous
+                         * pass to wait for the IO to complete. Wait here
+                         * for any page for which writeback has already
+                         * started.
+                         */
+                        if (sync_writeback == PAGEOUT_IO_SYNC && may_enter_fs)
+                                wait_on_page_writeback(page);
+                        else
+                                goto keep_locked;
+                }
 
                 referenced = page_referenced(page, 1);
                 /* In active use or really unfreeable? Activate it. */
@@ -478,8 +510,6 @@ static unsigned long shrink_page_list(struct list_head *page_list,
 #endif /* CONFIG_SWAP */
 
                 mapping = page_mapping(page);
-                may_enter_fs = (sc->gfp_mask & __GFP_FS) ||
-                        (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
 
                 /*
                  * The page is mapped into the page tables of one or more
@@ -505,7 +535,7 @@ static unsigned long shrink_page_list(struct list_head *page_list,
                                 goto keep_locked;
 
                         /* Page is dirty, try to write it out here */
-                        switch(pageout(page, mapping)) {
+                        switch (pageout(page, mapping, sync_writeback)) {
                         case PAGE_KEEP:
                                 goto keep_locked;
                         case PAGE_ACTIVATE:
@@ -777,6 +807,7 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                              (sc->order > PAGE_ALLOC_COSTLY_ORDER)?
                                              ISOLATE_BOTH : ISOLATE_INACTIVE);
                 nr_active = clear_active_flags(&page_list);
+                __count_vm_events(PGDEACTIVATE, nr_active);
                 __mod_zone_page_state(zone, NR_ACTIVE, -nr_active);
                 __mod_zone_page_state(zone, NR_INACTIVE,
@@ -785,7 +816,29 @@ static unsigned long shrink_inactive_list(unsigned long max_scan,
                 spin_unlock_irq(&zone->lru_lock);
 
                 nr_scanned += nr_scan;
-                nr_freed = shrink_page_list(&page_list, sc);
+                nr_freed = shrink_page_list(&page_list, sc, PAGEOUT_IO_ASYNC);
+
+                /*
+                 * If we are direct reclaiming for contiguous pages and we do
+                 * not reclaim everything in the list, try again and wait
+                 * for IO to complete. This will stall high-order allocations
+                 * but that should be acceptable to the caller
+                 */
+                if (nr_freed < nr_taken && !current_is_kswapd() &&
+                                        sc->order > PAGE_ALLOC_COSTLY_ORDER) {
+                        congestion_wait(WRITE, HZ/10);
+
+                        /*
+                         * The attempt at page out may have made some
+                         * of the pages active, mark them inactive again.
+                         */
+                        nr_active = clear_active_flags(&page_list);
+                        count_vm_events(PGDEACTIVATE, nr_active);
+
+                        nr_freed += shrink_page_list(&page_list, sc,
+                                                        PAGEOUT_IO_SYNC);
+                }
+
                 nr_reclaimed += nr_freed;
                 local_irq_disable();
                 if (current_is_kswapd()) {
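
Note: the mm/page_alloc.c hunks above skip zones in a policy zonelist that sit above the
highest zone the gfp mask allows. Below is a minimal user-space sketch of that filtering
test, not part of the patch; the enum values and the simplified gfp_zone() are stand-ins
for the kernel's definitions.

#include <stdio.h>

/* Stand-in zone indexes, ordered low to high as in the kernel. */
enum zone_type { ZONE_DMA, ZONE_NORMAL, ZONE_HIGHMEM, MAX_NR_ZONES };

/* Simplified stand-in for gfp_zone(): highest zone this allocation may use. */
static enum zone_type gfp_zone(int wants_highmem)
{
        return wants_highmem ? ZONE_HIGHMEM : ZONE_NORMAL;
}

int main(void)
{
        /* A policy zonelist built highest zone first, as bind_zonelist() builds it. */
        enum zone_type zonelist[MAX_NR_ZONES] = { ZONE_HIGHMEM, ZONE_NORMAL, ZONE_DMA };
        enum zone_type highest_zoneidx = gfp_zone(0);   /* a GFP_KERNEL-like request */
        int i;

        for (i = 0; i < MAX_NR_ZONES; i++) {
                if (zonelist[i] > highest_zoneidx) {
                        /* the same early skip the patch adds to get_page_from_freelist() */
                        printf("skip zone %d (above allowed %d)\n",
                               (int)zonelist[i], (int)highest_zoneidx);
                        continue;
                }
                printf("try zone %d\n", (int)zonelist[i]);
        }
        return 0;
}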
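
Note: the mm/sparse.c hunk pairs the real set_section_nid() under NODE_NOT_IN_PAGE_FLAGS
with an empty inline stub in the #else branch, so memory_present() can call it
unconditionally. A small sketch of that idiom, assuming made-up names (CONFIG_FOO,
record_nid, nid_table) rather than the kernel's:

#ifdef CONFIG_FOO
static int nid_table[128];

static void record_nid(unsigned long section, int nid)
{
        nid_table[section] = nid;       /* real work when the option is enabled */
}
#else /* !CONFIG_FOO */
static inline void record_nid(unsigned long section, int nid)
{
        /* compiles away entirely when CONFIG_FOO is not set */
}
#endif

Callers then invoke record_nid() with no #ifdef of their own, matching how
memory_present() now calls set_section_nid() unconditionally.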