diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/filemap.c | 86 | ||||
-rw-r--r-- | mm/memcontrol.c | 4 | ||||
-rw-r--r-- | mm/page-writeback.c | 2 | ||||
-rw-r--r-- | mm/page_alloc.c | 44 | ||||
-rw-r--r-- | mm/slab.c | 56 | ||||
-rw-r--r-- | mm/slub.c | 36 | ||||
-rw-r--r-- | mm/swapfile.c | 3 | ||||
-rw-r--r-- | mm/truncate.c | 40 |
8 files changed, 211 insertions, 60 deletions
diff --git a/mm/filemap.c b/mm/filemap.c index c3811bc..79c4b2b 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -813,20 +813,19 @@ EXPORT_SYMBOL(find_or_create_page); unsigned find_get_pages(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found, nr_skip; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, start, nr_pages); - ret = 0; - nr_skip = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -837,7 +836,7 @@ repeat: * when entry at index 0 moves out of or back * to root: none yet gotten, safe to restart. */ - WARN_ON(start | i); + WARN_ON(iter.index); goto restart; } /* @@ -845,7 +844,6 @@ repeat: * here as an exceptional entry: so skip over it - * we only reach this from invalidate_mapping_pages(). */ - nr_skip++; continue; } @@ -853,21 +851,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. - */ - if (unlikely(!ret && nr_found > nr_skip)) - goto restart; rcu_read_unlock(); return ret; } @@ -887,21 +880,22 @@ repeat: unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned int ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_slot(&mapping->page_tree, - (void ***)pages, NULL, index, nr_pages); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_contig(slot, &mapping->page_tree, &iter, index) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); + /* The hole, there no reason to continue */ if (unlikely(!page)) - continue; + break; if (radix_tree_exception(page)) { if (radix_tree_deref_retry(page)) { @@ -924,7 +918,7 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } @@ -934,14 +928,14 @@ repeat: * otherwise we can get both false positives and false * negatives, which is just confusing to the caller. */ - if (page->mapping == NULL || page->index != index) { + if (page->mapping == NULL || page->index != iter.index) { page_cache_release(page); break; } pages[ret] = page; - ret++; - index++; + if (++ret == nr_pages) + break; } rcu_read_unlock(); return ret; @@ -962,19 +956,20 @@ EXPORT_SYMBOL(find_get_pages_contig); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages) { - unsigned int i; - unsigned int ret; - unsigned int nr_found; + struct radix_tree_iter iter; + void **slot; + unsigned ret = 0; + + if (unlikely(!nr_pages)) + return 0; rcu_read_lock(); restart: - nr_found = radix_tree_gang_lookup_tag_slot(&mapping->page_tree, - (void ***)pages, *index, nr_pages, tag); - ret = 0; - for (i = 0; i < nr_found; i++) { + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, *index, tag) { struct page *page; repeat: - page = radix_tree_deref_slot((void **)pages[i]); + page = radix_tree_deref_slot(slot); if (unlikely(!page)) continue; @@ -998,21 +993,16 @@ repeat: goto repeat; /* Has the page moved? */ - if (unlikely(page != *((void **)pages[i]))) { + if (unlikely(page != *slot)) { page_cache_release(page); goto repeat; } pages[ret] = page; - ret++; + if (++ret == nr_pages) + break; } - /* - * If all entries were removed before we could secure them, - * try again, because callers stop trying once 0 is returned. - */ - if (unlikely(!ret && nr_found)) - goto restart; rcu_read_unlock(); if (ret) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b2ee6df..7d698df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5306,6 +5306,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, return 0; } + if (pmd_trans_unstable(pmd)) + return 0; pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; pte++, addr += PAGE_SIZE) if (get_mctgt_type(vma, addr, *pte, NULL)) @@ -5502,6 +5504,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, return 0; } + if (pmd_trans_unstable(pmd)) + return 0; retry: pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); for (; addr != end; addr += PAGE_SIZE) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 3fc2617..26adea8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -95,6 +95,8 @@ unsigned long vm_dirty_bytes; */ unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */ +EXPORT_SYMBOL_GPL(dirty_writeback_interval); + /* * The longest time for which data is allowed to remain dirty */ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index caea788..a712fb9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1161,11 +1161,47 @@ void drain_local_pages(void *arg) } /* - * Spill all the per-cpu pages from all CPUs back into the buddy allocator + * Spill all the per-cpu pages from all CPUs back into the buddy allocator. + * + * Note that this code is protected against sending an IPI to an offline + * CPU but does not guarantee sending an IPI to newly hotplugged CPUs: + * on_each_cpu_mask() blocks hotplug and won't talk to offlined CPUs but + * nothing keeps CPUs from showing up after we populated the cpumask and + * before the call to on_each_cpu_mask(). */ void drain_all_pages(void) { - on_each_cpu(drain_local_pages, NULL, 1); + int cpu; + struct per_cpu_pageset *pcp; + struct zone *zone; + + /* + * Allocate in the BSS so we wont require allocation in + * direct reclaim path for CONFIG_CPUMASK_OFFSTACK=y + */ + static cpumask_t cpus_with_pcps; + + /* + * We don't care about racing with CPU hotplug event + * as offline notification will cause the notified + * cpu to drain that CPU pcps and on_each_cpu_mask + * disables preemption as part of its processing + */ + for_each_online_cpu(cpu) { + bool has_pcps = false; + for_each_populated_zone(zone) { + pcp = per_cpu_ptr(zone->pageset, cpu); + if (pcp->pcp.count) { + has_pcps = true; + break; + } + } + if (has_pcps) + cpumask_set_cpu(cpu, &cpus_with_pcps); + else + cpumask_clear_cpu(cpu, &cpus_with_pcps); + } + on_each_cpu_mask(&cpus_with_pcps, drain_local_pages, NULL, 1); } #ifdef CONFIG_HIBERNATION @@ -2308,6 +2344,10 @@ rebalance: if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { if (oom_killer_disabled) goto nopage; + /* Coredumps can quickly deplete all memory reserves */ + if ((current->flags & PF_DUMPCORE) && + !(gfp_mask & __GFP_NOFAIL)) + goto nopage; page = __alloc_pages_may_oom(gfp_mask, order, zonelist, high_zoneidx, nodemask, preferred_zone, @@ -1731,6 +1731,52 @@ static int __init cpucache_init(void) } __initcall(cpucache_init); +static noinline void +slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) +{ + struct kmem_list3 *l3; + struct slab *slabp; + unsigned long flags; + int node; + + printk(KERN_WARNING + "SLAB: Unable to allocate memory on node %d (gfp=0x%x)\n", + nodeid, gfpflags); + printk(KERN_WARNING " cache: %s, object size: %d, order: %d\n", + cachep->name, cachep->buffer_size, cachep->gfporder); + + for_each_online_node(node) { + unsigned long active_objs = 0, num_objs = 0, free_objects = 0; + unsigned long active_slabs = 0, num_slabs = 0; + + l3 = cachep->nodelists[node]; + if (!l3) + continue; + + spin_lock_irqsave(&l3->list_lock, flags); + list_for_each_entry(slabp, &l3->slabs_full, list) { + active_objs += cachep->num; + active_slabs++; + } + list_for_each_entry(slabp, &l3->slabs_partial, list) { + active_objs += slabp->inuse; + active_slabs++; + } + list_for_each_entry(slabp, &l3->slabs_free, list) + num_slabs++; + + free_objects += l3->free_objects; + spin_unlock_irqrestore(&l3->list_lock, flags); + + num_slabs += active_slabs; + num_objs = num_slabs * cachep->num; + printk(KERN_WARNING + " node %d: slabs: %ld/%ld, objs: %ld/%ld, free: %ld\n", + node, active_slabs, num_slabs, active_objs, num_objs, + free_objects); + } +} + /* * Interface to system's page allocator. No need to hold the cache-lock. * @@ -1757,8 +1803,11 @@ static void *kmem_getpages(struct kmem_cache *cachep, gfp_t flags, int nodeid) flags |= __GFP_RECLAIMABLE; page = alloc_pages_exact_node(nodeid, flags | __GFP_NOTRACK, cachep->gfporder); - if (!page) + if (!page) { + if (!(flags & __GFP_NOWARN) && printk_ratelimit()) + slab_out_of_memory(cachep, flags, nodeid); return NULL; + } nr_pages = (1 << cachep->gfporder); if (cachep->flags & SLAB_RECLAIM_ACCOUNT) @@ -3696,13 +3745,12 @@ static inline void __cache_free(struct kmem_cache *cachep, void *objp, if (likely(ac->avail < ac->limit)) { STATS_INC_FREEHIT(cachep); - ac->entry[ac->avail++] = objp; - return; } else { STATS_INC_FREEMISS(cachep); cache_flusharray(cachep, ac); - ac->entry[ac->avail++] = objp; } + + ac->entry[ac->avail++] = objp; } /** @@ -29,6 +29,7 @@ #include <linux/math64.h> #include <linux/fault-inject.h> #include <linux/stacktrace.h> +#include <linux/prefetch.h> #include <trace/events/kmem.h> @@ -269,6 +270,11 @@ static inline void *get_freepointer(struct kmem_cache *s, void *object) return *(void **)(object + s->offset); } +static void prefetch_freepointer(const struct kmem_cache *s, void *object) +{ + prefetch(object + s->offset); +} + static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { void *p; @@ -1560,6 +1566,7 @@ static void *get_partial_node(struct kmem_cache *s, } else { page->freelist = t; available = put_cpu_partial(s, page, 0); + stat(s, CPU_PARTIAL_NODE); } if (kmem_cache_debug(s) || available > s->cpu_partial / 2) break; @@ -1983,6 +1990,7 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) local_irq_restore(flags); pobjects = 0; pages = 0; + stat(s, CPU_PARTIAL_DRAIN); } } @@ -1994,7 +2002,6 @@ int put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page->next = oldpage; } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); - stat(s, CPU_PARTIAL_FREE); return pobjects; } @@ -2028,9 +2035,17 @@ static void flush_cpu_slab(void *d) __flush_cpu_slab(s, smp_processor_id()); } +static bool has_cpu_slab(int cpu, void *info) +{ + struct kmem_cache *s = info; + struct kmem_cache_cpu *c = per_cpu_ptr(s->cpu_slab, cpu); + + return !!(c->page); +} + static void flush_all(struct kmem_cache *s) { - on_each_cpu(flush_cpu_slab, s, 1); + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1, GFP_ATOMIC); } /* @@ -2319,6 +2334,8 @@ redo: object = __slab_alloc(s, gfpflags, node, addr, c); else { + void *next_object = get_freepointer_safe(s, object); + /* * The cmpxchg will only match if there was no additional * operation and if we are on the right processor. @@ -2334,11 +2351,12 @@ redo: if (unlikely(!this_cpu_cmpxchg_double( s->cpu_slab->freelist, s->cpu_slab->tid, object, tid, - get_freepointer_safe(s, object), next_tid(tid)))) { + next_object, next_tid(tid)))) { note_cmpxchg_failure("slab_alloc", s, tid); goto redo; } + prefetch_freepointer(s, next_object); stat(s, ALLOC_FASTPATH); } @@ -2475,9 +2493,10 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * If we just froze the page then put it onto the * per cpu partial list. */ - if (new.frozen && !was_frozen) + if (new.frozen && !was_frozen) { put_cpu_partial(s, page, 1); - + stat(s, CPU_PARTIAL_FREE); + } /* * The list lock was not taken therefore no list * activity can be necessary. @@ -3939,13 +3958,14 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size, if (kmem_cache_open(s, n, size, align, flags, ctor)) { list_add(&s->list, &slab_caches); + up_write(&slub_lock); if (sysfs_slab_add(s)) { + down_write(&slub_lock); list_del(&s->list); kfree(n); kfree(s); goto err; } - up_write(&slub_lock); return s; } kfree(n); @@ -5069,6 +5089,8 @@ STAT_ATTR(CMPXCHG_DOUBLE_CPU_FAIL, cmpxchg_double_cpu_fail); STAT_ATTR(CMPXCHG_DOUBLE_FAIL, cmpxchg_double_fail); STAT_ATTR(CPU_PARTIAL_ALLOC, cpu_partial_alloc); STAT_ATTR(CPU_PARTIAL_FREE, cpu_partial_free); +STAT_ATTR(CPU_PARTIAL_NODE, cpu_partial_node); +STAT_ATTR(CPU_PARTIAL_DRAIN, cpu_partial_drain); #endif static struct attribute *slab_attrs[] = { @@ -5134,6 +5156,8 @@ static struct attribute *slab_attrs[] = { &cmpxchg_double_cpu_fail_attr.attr, &cpu_partial_alloc_attr.attr, &cpu_partial_free_attr.attr, + &cpu_partial_node_attr.attr, + &cpu_partial_drain_attr.attr, #endif #ifdef CONFIG_FAILSLAB &failslab_attr.attr, diff --git a/mm/swapfile.c b/mm/swapfile.c index dae42f3..fafc26d 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2022,6 +2022,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct page *page = NULL; struct inode *inode = NULL; + if (swap_flags & ~SWAP_FLAGS_VALID) + return -EINVAL; + if (!capable(CAP_SYS_ADMIN)) return -EPERM; diff --git a/mm/truncate.c b/mm/truncate.c index 18aded3..61a183b 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -626,3 +626,43 @@ int vmtruncate_range(struct inode *inode, loff_t lstart, loff_t lend) return 0; } + +/** + * truncate_pagecache_range - unmap and remove pagecache that is hole-punched + * @inode: inode + * @lstart: offset of beginning of hole + * @lend: offset of last byte of hole + * + * This function should typically be called before the filesystem + * releases resources associated with the freed range (eg. deallocates + * blocks). This way, pagecache will always stay logically coherent + * with on-disk format, and the filesystem would not have to deal with + * situations such as writepage being called for a page that has already + * had its underlying blocks deallocated. + */ +void truncate_pagecache_range(struct inode *inode, loff_t lstart, loff_t lend) +{ + struct address_space *mapping = inode->i_mapping; + loff_t unmap_start = round_up(lstart, PAGE_SIZE); + loff_t unmap_end = round_down(1 + lend, PAGE_SIZE) - 1; + /* + * This rounding is currently just for example: unmap_mapping_range + * expands its hole outwards, whereas we want it to contract the hole + * inwards. However, existing callers of truncate_pagecache_range are + * doing their own page rounding first; and truncate_inode_pages_range + * currently BUGs if lend is not pagealigned-1 (it handles partial + * page at start of hole, but not partial page at end of hole). Note + * unmap_mapping_range allows holelen 0 for all, and we allow lend -1. + */ + + /* + * Unlike in truncate_pagecache, unmap_mapping_range is called only + * once (before truncating pagecache), and without "even_cows" flag: + * hole-punching should not remove private COWed pages from the hole. + */ + if ((u64)unmap_end > (u64)unmap_start) + unmap_mapping_range(mapping, unmap_start, + 1 + unmap_end - unmap_start, 0); + truncate_inode_pages_range(mapping, lstart, lend); +} +EXPORT_SYMBOL(truncate_pagecache_range); |