From 0db0628d90125193280eabb501c94feaf48fa9ab Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 19 Jun 2013 14:53:51 -0400 Subject: kernel: delete __cpuinit usage from all core kernel files The __cpuinit type of throwaway sections might have made sense some time ago when RAM was more constrained, but now the savings do not offset the cost and complications. For example, the fix in commit 5e427ec2d0 ("x86: Fix bit corruption at CPU resume time") is a good example of the nasty type of bugs that can be created with improper use of the various __init prefixes. After a discussion on LKML[1] it was decided that cpuinit should go the way of devinit and be phased out. Once all the users are gone, we can then finally remove the macros themselves from linux/init.h. This removes all the uses of the __cpuinit macros from C files in the core kernel directories (kernel, init, lib, mm, and include) that don't really have a specific maintainer. [1] https://lkml.org/lkml/2013/5/20/589 Signed-off-by: Paul Gortmaker --- mm/memcontrol.c | 2 +- mm/page-writeback.c | 4 ++-- mm/slab.c | 10 +++++----- mm/slub.c | 4 ++-- mm/vmstat.c | 6 +++--- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d12ca6f..00a7a66 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2522,7 +2522,7 @@ static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) spin_unlock(&memcg->pcp_counter_lock); } -static int __cpuinit memcg_cpu_hotplug_callback(struct notifier_block *nb, +static int memcg_cpu_hotplug_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 4514ad7..3f0c895 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1619,7 +1619,7 @@ void writeback_set_ratelimit(void) ratelimit_pages = 16; } -static int __cpuinit +static int ratelimit_handler(struct notifier_block *self, unsigned long action, void *hcpu) { @@ -1634,7 +1634,7 @@ ratelimit_handler(struct notifier_block *self, unsigned long action, } } -static struct notifier_block __cpuinitdata ratelimit_nb = { +static struct notifier_block ratelimit_nb = { .notifier_call = ratelimit_handler, .next = NULL, }; diff --git a/mm/slab.c b/mm/slab.c index 35cb0c8..2580db0 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -787,7 +787,7 @@ static void next_reap_node(void) * the CPUs getting into lockstep and contending for the global cache chain * lock. 
*/ -static void __cpuinit start_cpu_timer(int cpu) +static void start_cpu_timer(int cpu) { struct delayed_work *reap_work = &per_cpu(slab_reap_work, cpu); @@ -1186,7 +1186,7 @@ static inline int slabs_tofree(struct kmem_cache *cachep, return (n->free_objects + cachep->num - 1) / cachep->num; } -static void __cpuinit cpuup_canceled(long cpu) +static void cpuup_canceled(long cpu) { struct kmem_cache *cachep; struct kmem_cache_node *n = NULL; @@ -1251,7 +1251,7 @@ free_array_cache: } } -static int __cpuinit cpuup_prepare(long cpu) +static int cpuup_prepare(long cpu) { struct kmem_cache *cachep; struct kmem_cache_node *n = NULL; @@ -1334,7 +1334,7 @@ bad: return -ENOMEM; } -static int __cpuinit cpuup_callback(struct notifier_block *nfb, +static int cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -1390,7 +1390,7 @@ static int __cpuinit cpuup_callback(struct notifier_block *nfb, return notifier_from_errno(err); } -static struct notifier_block __cpuinitdata cpucache_notifier = { +static struct notifier_block cpucache_notifier = { &cpuup_callback, NULL, 0 }; diff --git a/mm/slub.c b/mm/slub.c index 3b482c8..2b02d66 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3773,7 +3773,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags) * Use the cpu notifier to insure that the cpu slabs are flushed when * necessary. */ -static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, +static int slab_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { long cpu = (long)hcpu; @@ -3799,7 +3799,7 @@ static int __cpuinit slab_cpuup_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata slab_notifier = { +static struct notifier_block slab_notifier = { .notifier_call = slab_cpuup_callback }; diff --git a/mm/vmstat.c b/mm/vmstat.c index f42745e..20c2ef4 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1182,7 +1182,7 @@ static void vmstat_update(struct work_struct *w) round_jiffies_relative(sysctl_stat_interval)); } -static void __cpuinit start_cpu_timer(int cpu) +static void start_cpu_timer(int cpu) { struct delayed_work *work = &per_cpu(vmstat_work, cpu); @@ -1194,7 +1194,7 @@ static void __cpuinit start_cpu_timer(int cpu) * Use the cpu notifier to insure that the thresholds are recalculated * when necessary. */ -static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, +static int vmstat_cpuup_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { @@ -1226,7 +1226,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb, return NOTIFY_OK; } -static struct notifier_block __cpuinitdata vmstat_notifier = +static struct notifier_block vmstat_notifier = { &vmstat_cpuup_callback, NULL, 0 }; #endif -- cgit v1.1 From b9b3259746d77f4fcb786e2a43c25bcc40773755 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 14 Jul 2013 16:05:51 -0700 Subject: sysfs.h: add __ATTR_RW() macro A number of parts of the kernel created their own version of this, might as well have the sysfs core provide it instead. Reviewed-by: Guenter Roeck Tested-by: Guenter Roeck Signed-off-by: Greg Kroah-Hartman --- mm/backing-dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index d014ee5..e04454c 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -232,8 +232,6 @@ static ssize_t stable_pages_required_show(struct device *dev, bdi_cap_stable_pages_required(bdi) ? 
1 : 0); } -#define __ATTR_RW(attr) __ATTR(attr, 0644, attr##_show, attr##_store) - static struct device_attribute bdi_dev_attrs[] = { __ATTR_RW(read_ahead_kb), __ATTR_RW(min_ratio), -- cgit v1.1 From 3964acd0dbec123aa0a621973a2a0580034b4788 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 31 Jul 2013 13:53:28 -0700 Subject: mm: mempolicy: fix mbind_range() && vma_adjust() interaction vma_adjust() does vma_set_policy(vma, vma_policy(next)) and this is doubly wrong: 1. This leaks vma->vm_policy if it is not NULL and not equal to next->vm_policy. This can happen if vma_merge() expands "area", not prev (case 8). 2. This sets the wrong policy if vma_merge() joins prev and area, area is the vma the caller needs to update and it still has the old policy. Revert commit 1444f92c8498 ("mm: merging memory blocks resets mempolicy") which introduced these problems. Change mbind_range() to recheck mpol_equal() after vma_merge() to fix the problem that commit tried to address. Signed-off-by: Oleg Nesterov Acked-by: KOSAKI Motohiro Cc: Steven T Hampson Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: Rik van Riel Cc: Andi Kleen Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 6 +++++- mm/mmap.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 7431001..4baf12e 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -732,7 +732,10 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (prev) { vma = prev; next = vma->vm_next; - continue; + if (mpol_equal(vma_policy(vma), new_pol)) + continue; + /* vma_merge() joined vma && vma->next, case 8 */ + goto replace; } if (vma->vm_start != vmstart) { err = split_vma(vma->vm_mm, vma, vmstart, 1); @@ -744,6 +747,7 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, if (err) goto out; } + replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; diff --git a/mm/mmap.c b/mm/mmap.c index fbad7b0..1edbaa3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -865,7 +865,7 @@ again: remove_next = 1 + (end > next->vm_end); if (next->anon_vma) anon_vma_merge(vma, next); mm->map_count--; - vma_set_policy(vma, vma_policy(next)); + mpol_put(vma_policy(next)); kmem_cache_free(vm_area_cachep, next); /* * In mprotect's case 6 (see comments on vma_merge), -- cgit v1.1 From ef2a2cbdda7e9d084a85846770fcc844958881f6 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 31 Jul 2013 13:53:37 -0700 Subject: mm/swap.c: clear PageActive before adding pages onto unevictable list As a result of commit 13f7f78981e4 ("mm: pagevec: defer deciding which LRU to add a page to until pagevec drain time"), pages on unevictable lists can have both of PageActive and PageUnevictable set. This is not only confusing, but also corrupts page migration and shrink_[in]active_list. This patch fixes the problem by adding ClearPageActive before adding pages into unevictable list. It also cleans up VM_BUG_ONs. Signed-off-by: Naoya Horiguchi Cc: Mel Gorman Cc: KOSAKI Motohiro Cc: "Kirill A. 
Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/swap.c b/mm/swap.c index 4a1d0d2..d3982c0 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -512,12 +512,7 @@ EXPORT_SYMBOL(__lru_cache_add); */ void lru_cache_add(struct page *page) { - if (PageActive(page)) { - VM_BUG_ON(PageUnevictable(page)); - } else if (PageUnevictable(page)) { - VM_BUG_ON(PageActive(page)); - } - + VM_BUG_ON(PageActive(page) && PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); __lru_cache_add(page); } @@ -539,6 +534,7 @@ void add_page_to_unevictable_list(struct page *page) spin_lock_irq(&zone->lru_lock); lruvec = mem_cgroup_page_lruvec(page, zone); + ClearPageActive(page); SetPageUnevictable(page); SetPageLRU(page); add_page_to_lru_list(page, lruvec, LRU_UNEVICTABLE); @@ -833,7 +829,6 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, int active = PageActive(page); enum lru_list lru = page_lru(page); - VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); -- cgit v1.1 From e180cf806a93ea1abbce47b245d25204ff557ce9 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jul 2013 13:53:39 -0700 Subject: thp, mm: avoid PageUnevictable on active/inactive lru lists active/inactive lru lists can contain unevicable pages (i.e. ramfs pages that have been placed on the LRU lists when first allocated), but these pages must not have PageUnevictable set - otherwise shrink_[in]active_list goes crazy: kernel BUG at /home/space/kas/git/public/linux-next/mm/vmscan.c:1122! 1090 static unsigned long isolate_lru_pages(unsigned long nr_to_scan, 1091 struct lruvec *lruvec, struct list_head *dst, 1092 unsigned long *nr_scanned, struct scan_control *sc, 1093 isolate_mode_t mode, enum lru_list lru) 1094 { ... 1108 switch (__isolate_lru_page(page, mode)) { 1109 case 0: ... 1116 case -EBUSY: ... 1121 default: 1122 BUG(); 1123 } 1124 } ... 1130 } __isolate_lru_page() returns EINVAL for PageUnevictable(page). For lru_add_page_tail(), it means we should not set PageUnevictable() for tail pages unless we're sure that it will go to LRU_UNEVICTABLE. Let's just copy PG_active and PG_unevictable from head page in __split_huge_page_refcount(), it will simplify lru_add_page_tail(). This will fix one more bug in lru_add_page_tail(): if page_evictable(page_tail) is false and PageLRU(page) is true, page_tail will go to the same lru as page, but nobody cares to sync page_tail active/inactive state with page. So we can end up with inactive page on active lru. The patch will fix it as well since we copy PG_active from head page. Signed-off-by: Kirill A. 
Shutemov Acked-by: Dave Hansen Cc: Naoya Horiguchi Cc: Mel Gorman Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 4 +++- mm/swap.c | 20 ++------------------ 2 files changed, 5 insertions(+), 19 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 243e710..a92012a 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1620,7 +1620,9 @@ static void __split_huge_page_refcount(struct page *page, ((1L << PG_referenced) | (1L << PG_swapbacked) | (1L << PG_mlocked) | - (1L << PG_uptodate))); + (1L << PG_uptodate) | + (1L << PG_active) | + (1L << PG_unevictable))); page_tail->flags |= (1L << PG_dirty); /* clear PageTail before overwriting first_page */ diff --git a/mm/swap.c b/mm/swap.c index d3982c0..62b78a6 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -770,8 +770,6 @@ EXPORT_SYMBOL(__pagevec_release); void lru_add_page_tail(struct page *page, struct page *page_tail, struct lruvec *lruvec, struct list_head *list) { - int uninitialized_var(active); - enum lru_list lru; const int file = 0; VM_BUG_ON(!PageHead(page)); @@ -783,20 +781,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, if (!list) SetPageLRU(page_tail); - if (page_evictable(page_tail)) { - if (PageActive(page)) { - SetPageActive(page_tail); - active = 1; - lru = LRU_ACTIVE_ANON; - } else { - active = 0; - lru = LRU_INACTIVE_ANON; - } - } else { - SetPageUnevictable(page_tail); - lru = LRU_UNEVICTABLE; - } - if (likely(PageLRU(page))) list_add_tail(&page_tail->lru, &page->lru); else if (list) { @@ -812,13 +796,13 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, * Use the standard add function to put page_tail on the list, * but then correct its position so they all end up in order. */ - add_page_to_lru_list(page_tail, lruvec, lru); + add_page_to_lru_list(page_tail, lruvec, page_lru(page_tail)); list_head = page_tail->lru.prev; list_move_tail(&page_tail->lru, list_head); } if (!PageUnevictable(page)) - update_page_reclaim_stat(lruvec, file, active); + update_page_reclaim_stat(lruvec, file, PageActive(page_tail)); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -- cgit v1.1 From 9d8c5b5284e4a0167c5f7b1193692492358b8700 Mon Sep 17 00:00:00 2001 From: Heesub Shin Date: Wed, 31 Jul 2013 13:53:40 -0700 Subject: mm: zbud: fix condition check on allocation size zbud_alloc() incorrectly verifies the size of allocation limit. It should deny the allocation request greater than (PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE), not (PAGE_SIZE - ZHDR_SIZE_ALIGNED) which has no remaining spaces for its buddy. There is no point in spending the entire zbud page storing only a single page, since we don't have any benefits. 
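To make the limit concrete, here is a small standalone arithmetic check (an illustrative sketch, not kernel code; PAGE_SIZE, CHUNK_SIZE and ZHDR_SIZE_ALIGNED below are assumed stand-in values for the real zbud constants). It shows that an allocation at the old limit leaves zero chunks for a buddy, while the new limit always leaves at least one:

#include <stdio.h>

/* Illustrative values only -- stand-ins for the real zbud constants. */
#define PAGE_SIZE		4096
#define CHUNK_SIZE		64
#define ZHDR_SIZE_ALIGNED	CHUNK_SIZE

int main(void)
{
	/* Largest size the old check accepted: it fills every chunk after
	 * the header, so no chunk remains for a buddy. */
	int old_max = PAGE_SIZE - ZHDR_SIZE_ALIGNED;
	/* Largest size the new check accepts: at least one chunk remains. */
	int new_max = PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE;

	printf("chunks left for a buddy at old max: %d\n",
	       (PAGE_SIZE - ZHDR_SIZE_ALIGNED - old_max) / CHUNK_SIZE);	/* 0 */
	printf("chunks left for a buddy at new max: %d\n",
	       (PAGE_SIZE - ZHDR_SIZE_ALIGNED - new_max) / CHUNK_SIZE);	/* 1 */
	return 0;
}
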
Signed-off-by: Heesub Shin Acked-by: Seth Jennings Cc: Bob Liu Cc: Dongjun Shin Cc: Sunae Seo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zbud.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/zbud.c b/mm/zbud.c index 9bb4710..ad1e781 100644 --- a/mm/zbud.c +++ b/mm/zbud.c @@ -257,7 +257,7 @@ int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, if (size <= 0 || gfp & __GFP_HIGHMEM) return -EINVAL; - if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED) + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE) return -ENOSPC; chunks = size_to_chunks(size); spin_lock(&pool->lock); -- cgit v1.1 From 22f2020f84c6da2dd0acb2dce12e39e59ff7c8be Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jul 2013 13:53:48 -0700 Subject: vmpressure: change vmpressure::sr_lock to spinlock There is nothing that can sleep inside critical sections protected by this lock, and those sections are really small, so it doesn't make much sense to use a mutex for them. Change the lock to a spinlock. Signed-off-by: Michal Hocko Reported-by: Tejun Heo Cc: Anton Vorontsov Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Li Zefan Reviewed-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 736a601..f4ee6a1 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -180,12 +180,12 @@ static void vmpressure_work_fn(struct work_struct *work) if (!vmpr->scanned) return; - mutex_lock(&vmpr->sr_lock); + spin_lock(&vmpr->sr_lock); scanned = vmpr->scanned; reclaimed = vmpr->reclaimed; vmpr->scanned = 0; vmpr->reclaimed = 0; - mutex_unlock(&vmpr->sr_lock); + spin_unlock(&vmpr->sr_lock); do { if (vmpressure_event(vmpr, scanned, reclaimed)) @@ -240,11 +240,11 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, if (!scanned) return; - mutex_lock(&vmpr->sr_lock); + spin_lock(&vmpr->sr_lock); vmpr->scanned += scanned; vmpr->reclaimed += reclaimed; scanned = vmpr->scanned; - mutex_unlock(&vmpr->sr_lock); + spin_unlock(&vmpr->sr_lock); if (scanned < vmpressure_win || work_pending(&vmpr->work)) return; @@ -367,7 +367,7 @@ void vmpressure_unregister_event(struct cgroup *cg, struct cftype *cft, */ void vmpressure_init(struct vmpressure *vmpr) { - mutex_init(&vmpr->sr_lock); + spin_lock_init(&vmpr->sr_lock); mutex_init(&vmpr->events_lock); INIT_LIST_HEAD(&vmpr->events); INIT_WORK(&vmpr->work, vmpressure_work_fn); -- cgit v1.1 From 8e0ed445b3478468372449859c45c6b3032acf2f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jul 2013 13:53:50 -0700 Subject: vmpressure: do not check for pending work to prevent from new work because it is racy and it doesn't give us much anyway, as schedule_work() handles this case already.
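For context on why the pre-check can simply be dropped, here is a hedged sketch of the pattern (kernel context, not the actual mm/vmpressure.c code): schedule_work() already returns false and does nothing when the work item is still queued, so an explicit work_pending() test only adds a window where the state can change between the test and the call.

#include <linux/workqueue.h>

/* Sketch of the pattern the patch removes versus what it keeps. */
static void kick_work(struct work_struct *work)
{
	/*
	 * Racy variant (removed): the item can become pending, or stop
	 * being pending, between this test and the schedule_work() below.
	 *
	 *	if (work_pending(work))
	 *		return;
	 */
	schedule_work(work);	/* returns false and is a no-op if already queued */
}
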
Signed-off-by: Michal Hocko Reported-by: Tejun Heo Cc: Anton Vorontsov Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Li Zefan Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmpressure.c b/mm/vmpressure.c index f4ee6a1..192f9731 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -246,7 +246,7 @@ void vmpressure(gfp_t gfp, struct mem_cgroup *memcg, scanned = vmpr->scanned; spin_unlock(&vmpr->sr_lock); - if (scanned < vmpressure_win || work_pending(&vmpr->work)) + if (scanned < vmpressure_win) return; schedule_work(&vmpr->work); } -- cgit v1.1 From 33cb876e947b9ddda8dca3fb99234b743a597ef9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jul 2013 13:53:51 -0700 Subject: vmpressure: make sure there are no events queued after memcg is offlined vmpressure is called synchronously from reclaim, where the target_memcg is guaranteed to be alive, but the eventfd is signaled from work queue context. This means that the memcg (along with the vmpressure structure embedded into it) might go away while the work item is pending, which would result in a use-after-release bug. We have two possible ways to fix this: either vmpressure pins the memcg before it schedules vmpr->work and unpins it in vmpressure_work_fn, or we explicitly flush the work item from the css_offline context (as suggested by Tejun). This patch implements the latter and introduces vmpressure_cleanup(), which flushes the vmpressure work queue item. It hooks into mem_cgroup_css_offline after the memcg itself is cleaned up. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Michal Hocko Reported-by: Tejun Heo Cc: Anton Vorontsov Cc: Johannes Weiner Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Cc: Li Zefan Acked-by: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 1 + mm/vmpressure.c | 16 ++++++++++++++++ 2 files changed, 17 insertions(+) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 00a7a66..c290a1c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6335,6 +6335,7 @@ static void mem_cgroup_css_offline(struct cgroup *cont) mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); + vmpressure_cleanup(&memcg->vmpressure); } static void mem_cgroup_css_free(struct cgroup *cont) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 192f9731..0c1e37d 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -372,3 +372,19 @@ void vmpressure_init(struct vmpressure *vmpr) INIT_LIST_HEAD(&vmpr->events); INIT_WORK(&vmpr->work, vmpressure_work_fn); } + +/** + * vmpressure_cleanup() - shuts down vmpressure control structure + * @vmpr: Structure to be cleaned up + * + * This function should be called before the structure in which it is + * embedded is cleaned up. + */ +void vmpressure_cleanup(struct vmpressure *vmpr) +{ + /* + * Make sure there is no pending work before eventfd infrastructure + * goes away. + */ + flush_work(&vmpr->work); +} -- cgit v1.1 From 387aae6fdd737038e92d7bb40712bdf6dcb11945 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 4 Aug 2013 11:30:25 -0700 Subject: tmpfs: fix SEEK_DATA/SEEK_HOLE regression Commit 46a1c2c7ae53 ("vfs: export lseek_execute() to modules") broke the tmpfs SEEK_DATA/SEEK_HOLE implementation, because vfs_setpos() converts the carefully prepared -ENXIO to -EINVAL.
Other filesystems avoid it in error cases: do the same in tmpfs. Signed-off-by: Hugh Dickins Cc: Jie Liu Cc: Al Viro Signed-off-by: Linus Torvalds --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/shmem.c b/mm/shmem.c index a87990c..8335dbd 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1798,7 +1798,8 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) } } - offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); + if (offset >= 0) + offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); mutex_unlock(&inode->i_mutex); return offset; } -- cgit v1.1 From 370905069ce6515f38d5de0a5b1c899cbe58fe22 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 8 Aug 2013 09:06:37 -0700 Subject: Revert "slub: do not put a slab to cpu partial list when cpu_partial is 0" This reverts commit 318df36e57c0ca9f2146660d41ff28e8650af423. This commit caused Steven Rostedt's hackbench runs to run out of memory due to a leak. As noted by Joonsoo Kim, it is buggy in the following scenario: "I guess, you may set 0 to all kmem caches's cpu_partial via sysfs, doesn't it? In this case, memory leak is possible in following case. Code flow of possible leak is follwing case. * in __slab_free() 1. (!new.inuse || !prior) && !was_frozen 2. !kmem_cache_debug && !prior 3. new.frozen = 1 4. after cmpxchg_double_slab, run the (!n) case with new.frozen=1 5. with this patch, put_cpu_partial() doesn't do anything, because this cache's cpu_partial is 0 6. return In step 5, leak occur" And Steven does indeed have cpu_partial set to 0 due to RT testing. Joonsoo is cooking up a patch, but everybody agrees that reverting this for now is the right thing to do. Reported-and-bisected-by: Steven Rostedt Acked-by: Joonsoo Kim Acked-by: Pekka Enberg Signed-off-by: Linus Torvalds --- mm/slub.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index 2b02d66..e3ba1f2 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1968,9 +1968,6 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) int pages; int pobjects; - if (!s->cpu_partial) - return; - do { pages = 0; pobjects = 0; -- cgit v1.1 From 3e6b11df245180949938734bc192eaf32f3a06b3 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 13 Aug 2013 16:00:47 -0700 Subject: memcg: don't initialize kmem-cache destroying work for root caches struct memcg_cache_params has a union. Different parts of this union are used for root and non-root caches. A part with destroying work is used only for non-root caches. I fixed the same problem in another place v3.9-rc1-16204-gf101a94, but didn't notice this one. This patch fixes the kernel panic: [ 46.848187] BUG: unable to handle kernel paging request at 000000fffffffeb8 [ 46.849026] IP: [] kmem_cache_destroy_memcg_children+0x6c/0xc0 [ 46.849092] PGD 0 [ 46.849092] Oops: 0000 [#1] SMP ... 
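To illustrate the hazard, here is a hypothetical layout loosely modelled on memcg_cache_params (all field names below are invented, not the real structure): initializing a member that belongs to the non-root arm of the union while the cache is actually a root cache scribbles over the root-cache arm, which is the kind of bad pointer the oops above trips over.

#include <stdbool.h>
#include <string.h>

/* Hypothetical layout -- illustrative only. */
struct cache_params {
	bool is_root_cache;
	union {
		struct {				/* root-cache arm */
			void *per_memcg_caches[8];
		};
		struct {				/* child-cache arm */
			void *memcg;
			void *root_cache;
			char destroy_work[32];		/* stands in for struct work_struct */
		};
	};
};

static void register_cache(struct cache_params *p, void *memcg, void *root)
{
	if (memcg) {
		p->memcg = memcg;
		p->root_cache = root;
		/* INIT_WORK() belongs here, where the child arm is in use. */
		memset(p->destroy_work, 0, sizeof(p->destroy_work));
	} else {
		p->is_root_cache = true;
		/*
		 * Touching destroy_work here would overwrite
		 * per_memcg_caches[] through the union -- corrupted
		 * pointers, and later a paging fault like the one above.
		 */
	}
}

int main(void)
{
	struct cache_params root = { 0 }, child = { 0 };

	register_cache(&root, NULL, NULL);		/* root cache: child arm untouched */
	register_cache(&child, (void *)0x1, &root);	/* child cache: child arm in use */
	return 0;
}
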
Signed-off-by: Andrey Vagin Cc: Glauber Costa Cc: Johannes Weiner Acked-by: Michal Hocko Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Konstantin Khlebnikov Cc: [3.9.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index c290a1c..c5792a5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3195,11 +3195,11 @@ int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, if (!s->memcg_params) return -ENOMEM; - INIT_WORK(&s->memcg_params->destroy, - kmem_cache_destroy_work_func); if (memcg) { s->memcg_params->memcg = memcg; s->memcg_params->root_cache = root_cache; + INIT_WORK(&s->memcg_params->destroy, + kmem_cache_destroy_work_func); } else s->memcg_params->is_root_cache = true; -- cgit v1.1 From 179ef71cbc085252e3fe6b8159263a7ed1d88ea4 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 Aug 2013 16:00:49 -0700 Subject: mm: save soft-dirty bits on swapped pages Andy Lutomirski reported that if a page with _PAGE_SOFT_DIRTY bit set get swapped out, the bit is getting lost and no longer available when pte read back. To resolve this we introduce _PTE_SWP_SOFT_DIRTY bit which is saved in pte entry for the page being swapped out. When such page is to be read back from a swap cache we check for bit presence and if it's there we clear it and restore the former _PAGE_SOFT_DIRTY bit back. One of the problem was to find a place in pte entry where we can save the _PTE_SWP_SOFT_DIRTY bit while page is in swap. The _PAGE_PSE was chosen for that, it doesn't intersect with swap entry format stored in pte. Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Reviewed-by: Minchan Kim Reviewed-by: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 2 ++ mm/rmap.c | 6 +++++- mm/swapfile.c | 19 +++++++++++++++++-- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index 1ce2e2a..e98ecad2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3115,6 +3115,8 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma, exclusive = 1; } flush_icache_page(vma, page); + if (pte_swp_soft_dirty(orig_pte)) + pte = pte_mksoft_dirty(pte); set_pte_at(mm, address, page_table, pte); if (page == swapcache) do_page_add_anon_rmap(page, vma, address, exclusive); diff --git a/mm/rmap.c b/mm/rmap.c index cd356df..83325b8 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1236,6 +1236,7 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, swp_entry_to_pte(make_hwpoison_entry(page))); } else if (PageAnon(page)) { swp_entry_t entry = { .val = page_private(page) }; + pte_t swp_pte; if (PageSwapCache(page)) { /* @@ -1264,7 +1265,10 @@ int try_to_unmap_one(struct page *page, struct vm_area_struct *vma, BUG_ON(TTU_ACTION(flags) != TTU_MIGRATION); entry = make_migration_entry(page, pte_write(pteval)); } - set_pte_at(mm, address, pte, swp_entry_to_pte(entry)); + swp_pte = swp_entry_to_pte(entry); + if (pte_soft_dirty(pteval)) + swp_pte = pte_swp_mksoft_dirty(swp_pte); + set_pte_at(mm, address, pte, swp_pte); BUG_ON(pte_file(*pte)); } else if (IS_ENABLED(CONFIG_MIGRATION) && (TTU_ACTION(flags) == TTU_MIGRATION)) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 36af6ee..6cf2e60 100644 --- 
a/mm/swapfile.c +++ b/mm/swapfile.c @@ -866,6 +866,21 @@ unsigned int count_swap_pages(int type, int free) } #endif /* CONFIG_HIBERNATION */ +static inline int maybe_same_pte(pte_t pte, pte_t swp_pte) +{ +#ifdef CONFIG_MEM_SOFT_DIRTY + /* + * When pte keeps soft dirty bit the pte generated + * from swap entry does not has it, still it's same + * pte from logical point of view. + */ + pte_t swp_pte_dirty = pte_swp_mksoft_dirty(swp_pte); + return pte_same(pte, swp_pte) || pte_same(pte, swp_pte_dirty); +#else + return pte_same(pte, swp_pte); +#endif +} + /* * No need to decide whether this PTE shares the swap entry with others, * just let do_wp_page work it out if a write is requested later - to @@ -892,7 +907,7 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, } pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl); - if (unlikely(!pte_same(*pte, swp_entry_to_pte(entry)))) { + if (unlikely(!maybe_same_pte(*pte, swp_entry_to_pte(entry)))) { mem_cgroup_cancel_charge_swapin(memcg); ret = 0; goto out; @@ -947,7 +962,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, * swapoff spends a _lot_ of time in this loop! * Test inline before going to call unuse_pte. */ - if (unlikely(pte_same(*pte, swp_pte))) { + if (unlikely(maybe_same_pte(*pte, swp_pte))) { pte_unmap(pte); ret = unuse_pte(vma, pmd, addr, entry, page); if (ret) -- cgit v1.1 From 41bb3476b361ef38576cf9d539b19bae2ac93167 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Tue, 13 Aug 2013 16:00:51 -0700 Subject: mm: save soft-dirty bits on file pages Andy reported that if file page get reclaimed we lose the soft-dirty bit if it was there, so save _PAGE_BIT_SOFT_DIRTY bit when page address get encoded into pte entry. Thus when #pf happens on such non-present pte we can restore it back. 
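As a purely conceptual illustration of that idea (the real pte_file()/pgoff_to_pte() encodings are architecture-specific; the bit positions below are invented), a spare bit in the non-present "file pte" encoding can carry the soft-dirty flag across reclaim so it can be restored on the next fault:

#include <stdint.h>
#include <stdio.h>

#define FPTE_FILE	 (1ull << 0)	/* marks a non-present "file pte" */
#define FPTE_SOFT_DIRTY	 (1ull << 1)	/* assumed spare bit for soft-dirty */
#define FPTE_PGOFF_SHIFT 2

static uint64_t pgoff_to_file_pte(uint64_t pgoff, int soft_dirty)
{
	uint64_t pte = (pgoff << FPTE_PGOFF_SHIFT) | FPTE_FILE;

	if (soft_dirty)		/* preserve the flag while the page is gone */
		pte |= FPTE_SOFT_DIRTY;
	return pte;
}

int main(void)
{
	/* Page at file offset 12345 reclaimed while soft-dirty. */
	uint64_t fpte = pgoff_to_file_pte(12345, 1);

	/* On the next fault both the offset and the flag are recoverable. */
	printf("pgoff=%llu soft_dirty=%d\n",
	       (unsigned long long)(fpte >> FPTE_PGOFF_SHIFT),
	       !!(fpte & FPTE_SOFT_DIRTY));
	return 0;
}
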
Reported-by: Andy Lutomirski Signed-off-by: Cyrill Gorcunov Acked-by: Pavel Emelyanov Cc: Matt Mackall Cc: Xiao Guangrong Cc: Marcelo Tosatti Cc: KOSAKI Motohiro Cc: Stephen Rothwell Cc: Peter Zijlstra Cc: "Aneesh Kumar K.V" Cc: Minchan Kim Cc: Wanpeng Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fremap.c | 11 ++++++++--- mm/memory.c | 11 ++++++++--- mm/rmap.c | 8 ++++++-- 3 files changed, 22 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 87da359..5bff081 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -57,17 +57,22 @@ static int install_file_pte(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, unsigned long pgoff, pgprot_t prot) { int err = -ENOMEM; - pte_t *pte; + pte_t *pte, ptfile; spinlock_t *ptl; pte = get_locked_pte(mm, addr, &ptl); if (!pte) goto out; - if (!pte_none(*pte)) + ptfile = pgoff_to_pte(pgoff); + + if (!pte_none(*pte)) { + if (pte_present(*pte) && pte_soft_dirty(*pte)) + pte_file_mksoft_dirty(ptfile); zap_pte(mm, vma, addr, pte); + } - set_pte_at(mm, addr, pte, pgoff_to_pte(pgoff)); + set_pte_at(mm, addr, pte, ptfile); /* * We don't need to run update_mmu_cache() here because the "file pte" * being installed by install_file_pte() is not a real pte - it's a diff --git a/mm/memory.c b/mm/memory.c index e98ecad2..4026841 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1141,9 +1141,12 @@ again: continue; if (unlikely(details) && details->nonlinear_vma && linear_page_index(details->nonlinear_vma, - addr) != page->index) - set_pte_at(mm, addr, pte, - pgoff_to_pte(page->index)); + addr) != page->index) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(ptent)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, addr, pte, ptfile); + } if (PageAnon(page)) rss[MM_ANONPAGES]--; else { @@ -3410,6 +3413,8 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = mk_pte(page, vma->vm_page_prot); if (flags & FAULT_FLAG_WRITE) entry = maybe_mkwrite(pte_mkdirty(entry), vma); + else if (pte_file(orig_pte) && pte_file_soft_dirty(orig_pte)) + pte_mksoft_dirty(entry); if (anon) { inc_mm_counter_fast(mm, MM_ANONPAGES); page_add_new_anon_rmap(page, vma, address); diff --git a/mm/rmap.c b/mm/rmap.c index 83325b8..b2e29ac 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1405,8 +1405,12 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, pteval = ptep_clear_flush(vma, address, pte); /* If nonlinear, store the file page offset in the pte. */ - if (page->index != linear_page_index(vma, address)) - set_pte_at(mm, address, pte, pgoff_to_pte(page->index)); + if (page->index != linear_page_index(vma, address)) { + pte_t ptfile = pgoff_to_pte(page->index); + if (pte_soft_dirty(pteval)) + pte_file_mksoft_dirty(ptfile); + set_pte_at(mm, address, pte, ptfile); + } /* Move the dirty bit to the physical page now the pte is gone. */ if (pte_dirty(pteval)) -- cgit v1.1 From 2b047252d087be7f2ba088b4933cd904f92e6fce Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 15 Aug 2013 11:42:25 -0700 Subject: Fix TLB gather virtual address range invalidation corner cases Ben Tebulin reported: "Since v3.7.2 on two independent machines a very specific Git repository fails in 9/10 cases on git-fsck due to an SHA1/memory failures. This only occurs on a very specific repository and can be reproduced stably on two independent laptops. 
Git mailing list ran out of ideas and for me this looks like some very exotic kernel issue" and bisected the failure to the backport of commit 53a59fc67f97 ("mm: limit mmu_gather batching to fix soft lockups on !CONFIG_PREEMPT"). That commit itself is not actually buggy, but what it does is to make it much more likely to hit the partial TLB invalidation case, since it introduces a new case in tlb_next_batch() that previously only ever happened when running out of memory. The real bug is that the TLB gather virtual memory range setup is subtly buggered. It was introduced in commit 597e1c3580b7 ("mm/mmu_gather: enable tlb flush range in generic mmu_gather"), and the range handling was already fixed at least once in commit e6c495a96ce0 ("mm: fix the TLB range flushed when __tlb_remove_page() runs out of slots"), but that fix was not complete. The problem with the TLB gather virtual address range is that it isn't set up by the initial tlb_gather_mmu() initialization (which didn't get the TLB range information), but it is set up ad-hoc later by the functions that actually flush the TLB. And so any such case that forgot to update the TLB range entries would potentially miss TLB invalidates. Rather than try to figure out exactly which particular ad-hoc range setup was missing (I personally suspect it's the hugetlb case in zap_huge_pmd(), which didn't have the same logic as zap_pte_range() did), this patch just gets rid of the problem at the source: make the TLB range information available to tlb_gather_mmu(), and initialize it when initializing all the other tlb gather fields. This makes the patch larger, but conceptually much simpler. And the end result is much more understandable; even if you want to play games with partial ranges when invalidating the TLB contents in chunks, now the range information is always there, and anybody who doesn't want to bother with it won't introduce subtle bugs. Ben verified that this fixes his problem. Reported-bisected-and-tested-by: Ben Tebulin Build-testing-by: Stephen Rothwell Build-testing-by: Richard Weinberger Reviewed-by: Michal Hocko Acked-by: Peter Zijlstra Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 2 +- mm/memory.c | 36 +++++++++++++++++++++--------------- mm/mmap.c | 4 ++-- 3 files changed, 24 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 83aff0a..b60f330 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2490,7 +2490,7 @@ void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, mm = vma->vm_mm; - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); __unmap_hugepage_range(&tlb, vma, start, end, ref_page); tlb_finish_mmu(&tlb, start, end); } diff --git a/mm/memory.c b/mm/memory.c index 4026841..af84bc0 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -209,14 +209,15 @@ static int tlb_next_batch(struct mmu_gather *tlb) * tear-down from @mm. The @fullmm argument is used when @mm is without * users and we're going to destroy the full address space (exit/execve). */ -void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, bool fullmm) +void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) { tlb->mm = mm; - tlb->fullmm = fullmm; + /* Is it from 0 to ~0? 
*/ + tlb->fullmm = !(start | (end+1)); tlb->need_flush_all = 0; - tlb->start = -1UL; - tlb->end = 0; + tlb->start = start; + tlb->end = end; tlb->need_flush = 0; tlb->local.next = NULL; tlb->local.nr = 0; @@ -256,8 +257,6 @@ void tlb_finish_mmu(struct mmu_gather *tlb, unsigned long start, unsigned long e { struct mmu_gather_batch *batch, *next; - tlb->start = start; - tlb->end = end; tlb_flush_mmu(tlb); /* keep the page table cache within bounds */ @@ -1099,7 +1098,6 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; - unsigned long range_start = addr; again: init_rss_vec(rss); @@ -1205,17 +1203,25 @@ again: * and page-free while holding it. */ if (force_flush) { + unsigned long old_end; + force_flush = 0; -#ifdef HAVE_GENERIC_MMU_GATHER - tlb->start = range_start; + /* + * Flush the TLB just for the previous segment, + * then update the range to be the remaining + * TLB range. + */ + old_end = tlb->end; tlb->end = addr; -#endif + tlb_flush_mmu(tlb); - if (addr != end) { - range_start = addr; + + tlb->start = addr; + tlb->end = old_end; + + if (addr != end) goto again; - } } return addr; @@ -1400,7 +1406,7 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long end = start + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, start, end); for ( ; vma && vma->vm_start < end; vma = vma->vm_next) @@ -1426,7 +1432,7 @@ static void zap_page_range_single(struct vm_area_struct *vma, unsigned long addr unsigned long end = address + size; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, address, end); update_hiwater_rss(mm); mmu_notifier_invalidate_range_start(mm, address, end); unmap_single_vma(&tlb, vma, address, end, details); diff --git a/mm/mmap.c b/mm/mmap.c index 1edbaa3..f9c97d1 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2336,7 +2336,7 @@ static void unmap_region(struct mm_struct *mm, struct mmu_gather tlb; lru_add_drain(); - tlb_gather_mmu(&tlb, mm, 0); + tlb_gather_mmu(&tlb, mm, start, end); update_hiwater_rss(mm); unmap_vmas(&tlb, vma, start, end); free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, @@ -2709,7 +2709,7 @@ void exit_mmap(struct mm_struct *mm) lru_add_drain(); flush_cache_mm(mm); - tlb_gather_mmu(&tlb, mm, 1); + tlb_gather_mmu(&tlb, mm, 0, -1); /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use -1 here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, vma, 0, -1); -- cgit v1.1
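A closing note on the fullmm test added in the patch above, tlb->fullmm = !(start | (end + 1)): it is true exactly for the start = 0, end = ~0 whole-address-space case that exit_mmap() now passes, and false for any bounded range. A quick standalone check (not kernel code):

#include <stdio.h>

int main(void)
{
	unsigned long cases[][2] = {
		{ 0UL, ~0UL },		/* exit_mmap(): tear down the full mm */
		{ 0x1000UL, 0x8000UL },	/* an ordinary unmap_region() range  */
	};

	for (int i = 0; i < 2; i++) {
		unsigned long start = cases[i][0], end = cases[i][1];
		int fullmm = !(start | (end + 1));

		printf("start=%#lx end=%#lx -> fullmm=%d\n", start, end, fullmm);
	}
	return 0;	/* prints fullmm=1, then fullmm=0 */
}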