From d0e1d66b5aa1ec9f556f951aa9a114cc192cd01c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 11 Dec 2012 16:00:21 -0800 Subject: writeback: remove nr_pages_dirtied arg from balance_dirty_pages_ratelimited_nr() There is no reason to pass the nr_pages_dirtied argument, because nr_pages_dirtied value from the caller is unused in balance_dirty_pages_ratelimited_nr(). Signed-off-by: Namjae Jeon Signed-off-by: Vivek Trivedi Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 830893b..6f427122 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1069,7 +1069,7 @@ static void bdi_update_bandwidth(struct backing_dev_info *bdi, } /* - * After a task dirtied this many pages, balance_dirty_pages_ratelimited_nr() + * After a task dirtied this many pages, balance_dirty_pages_ratelimited() * will look to see if it needs to start dirty throttling. * * If dirty_poll_interval is too low, big NUMA machines will call the expensive @@ -1436,9 +1436,8 @@ static DEFINE_PER_CPU(int, bdp_ratelimits); DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; /** - * balance_dirty_pages_ratelimited_nr - balance dirty memory state + * balance_dirty_pages_ratelimited - balance dirty memory state * @mapping: address_space which was dirtied - * @nr_pages_dirtied: number of pages which the caller has just dirtied * * Processes which are dirtying memory should call in here once for each page * which was newly dirtied. The function will periodically check the system's @@ -1449,8 +1448,7 @@ DEFINE_PER_CPU(int, dirty_throttle_leaks) = 0; * limit we decrease the ratelimiting by a lot, to prevent individual processes * from overshooting the limit by (ratelimit_pages) each. */ -void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, - unsigned long nr_pages_dirtied) +void balance_dirty_pages_ratelimited(struct address_space *mapping) { struct backing_dev_info *bdi = mapping->backing_dev_info; int ratelimit; @@ -1484,6 +1482,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, */ p = &__get_cpu_var(dirty_throttle_leaks); if (*p > 0 && current->nr_dirtied < ratelimit) { + unsigned long nr_pages_dirtied; nr_pages_dirtied = min(*p, ratelimit - current->nr_dirtied); *p -= nr_pages_dirtied; current->nr_dirtied += nr_pages_dirtied; @@ -1493,7 +1492,7 @@ void balance_dirty_pages_ratelimited_nr(struct address_space *mapping, if (unlikely(current->nr_dirtied >= ratelimit)) balance_dirty_pages(mapping, current->nr_dirtied); } -EXPORT_SYMBOL(balance_dirty_pages_ratelimited_nr); +EXPORT_SYMBOL(balance_dirty_pages_ratelimited); void throttle_vm_writeout(gfp_t gfp_mask) { -- cgit v1.1 From 377e4f167664d8bc390c04c911d846366000159c Mon Sep 17 00:00:00 2001 From: Rabin Vincent Date: Tue, 11 Dec 2012 16:00:24 -0800 Subject: mm: show migration types in show_mem This is useful to diagnose the reason for page allocation failure for cases where there appear to be several free pages. Example, with this alloc_pages(GFP_ATOMIC) failure: swapper/0: page allocation failure: order:0, mode:0x0 ... Mem-info: Normal per-cpu: CPU 0: hi: 90, btch: 15 usd: 48 CPU 1: hi: 90, btch: 15 usd: 21 active_anon:0 inactive_anon:0 isolated_anon:0 active_file:0 inactive_file:84 isolated_file:0 unevictable:0 dirty:0 writeback:0 unstable:0 free:4026 slab_reclaimable:75 slab_unreclaimable:484 mapped:0 shmem:0 pagetables:0 bounce:0 Normal free:16104kB min:2296kB low:2868kB high:3444kB active_anon:0kB inactive_anon:0kB active_file:0kB inactive_file:336kB unevictable:0kB isolated(anon):0kB isolated(file):0kB present:331776kB mlocked:0kB dirty:0kB writeback:0kB mapped:0kB shmem:0kB slab_reclaimable:300kB slab_unreclaimable:1936kB kernel_stack:328kB pagetables:0kB unstable:0kB bounce:0kB writeback_tmp:0kB pages_scanned:0 all_unreclaimable? no lowmem_reserve[]: 0 0 Before the patch, it's hard (for me, at least) to say why all these free chunks weren't considered for allocation: Normal: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 1*256kB 1*512kB 1*1024kB 1*2048kB 3*4096kB = 16128kB After the patch, it's obvious that the reason is that all of these are in the MIGRATE_CMA (C) freelist: Normal: 0*4kB 0*8kB 0*16kB 0*32kB 0*64kB 0*128kB 1*256kB (C) 1*512kB (C) 1*1024kB (C) 1*2048kB (C) 3*4096kB (C) = 16128kB Signed-off-by: Rabin Vincent Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 42 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 40 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e208f0..dc018b4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2877,6 +2877,31 @@ out: #define K(x) ((x) << (PAGE_SHIFT-10)) +static void show_migration_types(unsigned char type) +{ + static const char types[MIGRATE_TYPES] = { + [MIGRATE_UNMOVABLE] = 'U', + [MIGRATE_RECLAIMABLE] = 'E', + [MIGRATE_MOVABLE] = 'M', + [MIGRATE_RESERVE] = 'R', +#ifdef CONFIG_CMA + [MIGRATE_CMA] = 'C', +#endif + [MIGRATE_ISOLATE] = 'I', + }; + char tmp[MIGRATE_TYPES + 1]; + char *p = tmp; + int i; + + for (i = 0; i < MIGRATE_TYPES; i++) { + if (type & (1 << i)) + *p++ = types[i]; + } + + *p = '\0'; + printk("(%s) ", tmp); +} + /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -3005,6 +3030,7 @@ void show_free_areas(unsigned int filter) for_each_populated_zone(zone) { unsigned long nr[MAX_ORDER], flags, order, total = 0; + unsigned char types[MAX_ORDER]; if (skip_free_areas_node(filter, zone_to_nid(zone))) continue; @@ -3013,12 +3039,24 @@ void show_free_areas(unsigned int filter) spin_lock_irqsave(&zone->lock, flags); for (order = 0; order < MAX_ORDER; order++) { - nr[order] = zone->free_area[order].nr_free; + struct free_area *area = &zone->free_area[order]; + int type; + + nr[order] = area->nr_free; total += nr[order] << order; + + types[order] = 0; + for (type = 0; type < MIGRATE_TYPES; type++) { + if (!list_empty(&area->free_list[type])) + types[order] |= 1 << type; + } } spin_unlock_irqrestore(&zone->lock, flags); - for (order = 0; order < MAX_ORDER; order++) + for (order = 0; order < MAX_ORDER; order++) { printk("%lu*%lukB ", nr[order], K(1UL) << order); + if (nr[order]) + show_migration_types(types[order]); + } printk("= %lukB\n", K(total)); } -- cgit v1.1 From 19965460e31c73a934d2c19c152f876a75bdff3e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:00:26 -0800 Subject: mm, memcg: make mem_cgroup_out_of_memory() static mem_cgroup_out_of_memory() is only referenced from within file scope, so it can be marked static. Signed-off-by: David Rientjes Acked-by: KAMEZAWA Hiroyuki Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index dd39ba0..cf6d0df 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1498,8 +1498,8 @@ static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) return limit; } -void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, - int order) +static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, + int order) { struct mem_cgroup *iter; unsigned long chosen_points = 0; -- cgit v1.1 From e5adfffc857788c8b7eca0e98cf1e26f1964b292 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 11 Dec 2012 16:00:29 -0800 Subject: mm: use IS_ENABLED(CONFIG_NUMA) instead of NUMA_BUILD We don't need custom NUMA_BUILD anymore, since we have handy IS_ENABLED(). Signed-off-by: Kirill A. Shutemov Acked-by: KOSAKI Motohiro Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 18 ++++++++++-------- mm/vmalloc.c | 4 ++-- 2 files changed, 12 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index dc018b4..a49b0ea 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1871,7 +1871,7 @@ zonelist_scan: */ for_each_zone_zonelist_nodemask(zone, z, zonelist, high_zoneidx, nodemask) { - if (NUMA_BUILD && zlc_active && + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; if ((alloc_flags & ALLOC_CPUSET) && @@ -1917,7 +1917,8 @@ zonelist_scan: classzone_idx, alloc_flags)) goto try_this_zone; - if (NUMA_BUILD && !did_zlc_setup && nr_online_nodes > 1) { + if (IS_ENABLED(CONFIG_NUMA) && + !did_zlc_setup && nr_online_nodes > 1) { /* * we do zlc_setup if there are multiple nodes * and before considering the first zone allowed @@ -1936,7 +1937,7 @@ zonelist_scan: * As we may have just activated ZLC, check if the first * eligible zone has failed zone_reclaim recently. */ - if (NUMA_BUILD && zlc_active && + if (IS_ENABLED(CONFIG_NUMA) && zlc_active && !zlc_zone_worth_trying(zonelist, z, allowednodes)) continue; @@ -1962,11 +1963,11 @@ try_this_zone: if (page) break; this_zone_full: - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) zlc_mark_zone_full(zonelist, z); } - if (unlikely(NUMA_BUILD && page == NULL && zlc_active)) { + if (unlikely(IS_ENABLED(CONFIG_NUMA) && page == NULL && zlc_active)) { /* Disable zlc cache for second zonelist scan */ zlc_active = 0; goto zonelist_scan; @@ -2266,7 +2267,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order, return NULL; /* After successful reclaim, reconsider all zones for allocation */ - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) zlc_clear_zones_full(zonelist); retry: @@ -2412,7 +2413,8 @@ __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order, * allowed per node queues are empty and that nodes are * over allocated. */ - if (NUMA_BUILD && (gfp_mask & GFP_THISNODE) == GFP_THISNODE) + if (IS_ENABLED(CONFIG_NUMA) && + (gfp_mask & GFP_THISNODE) == GFP_THISNODE) goto nopage; restart: @@ -2819,7 +2821,7 @@ unsigned int nr_free_pagecache_pages(void) static inline void show_node(struct zone *zone) { - if (NUMA_BUILD) + if (IS_ENABLED(CONFIG_NUMA)) printk("Node %d ", zone_to_nid(zone)); } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 78e0830..5123a16 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2550,7 +2550,7 @@ static void s_stop(struct seq_file *m, void *p) static void show_numa_info(struct seq_file *m, struct vm_struct *v) { - if (NUMA_BUILD) { + if (IS_ENABLED(CONFIG_NUMA)) { unsigned int nr, *counters = m->private; if (!counters) @@ -2615,7 +2615,7 @@ static int vmalloc_open(struct inode *inode, struct file *file) unsigned int *ptr = NULL; int ret; - if (NUMA_BUILD) { + if (IS_ENABLED(CONFIG_NUMA)) { ptr = kmalloc(nr_node_ids * sizeof(unsigned int), GFP_KERNEL); if (ptr == NULL) return -ENOMEM; -- cgit v1.1 From d84da3f9e4f18809821562bd960e00a10673b341 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 11 Dec 2012 16:00:31 -0800 Subject: mm: use IS_ENABLED(CONFIG_COMPACTION) instead of COMPACTION_BUILD We don't need custom COMPACTION_BUILD anymore, since we have handy IS_ENABLED(). Signed-off-by: Kirill A. Shutemov Acked-by: Minchan Kim Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index b7ed376..a1ce17f 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1752,7 +1752,7 @@ out: /* Use reclaim/compaction for costly allocs or under memory pressure */ static bool in_reclaim_compaction(struct scan_control *sc) { - if (COMPACTION_BUILD && sc->order && + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && (sc->order > PAGE_ALLOC_COSTLY_ORDER || sc->priority < DEF_PRIORITY - 2)) return true; @@ -2005,7 +2005,7 @@ static bool shrink_zones(struct zonelist *zonelist, struct scan_control *sc) if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) continue; /* Let kswapd poll it */ - if (COMPACTION_BUILD) { + if (IS_ENABLED(CONFIG_COMPACTION)) { /* * If we already have plenty of memory free for * compaction in this zone, don't free any more. @@ -2421,7 +2421,8 @@ static bool zone_balanced(struct zone *zone, int order, balance_gap, classzone_idx, 0)) return false; - if (COMPACTION_BUILD && order && !compaction_suitable(zone, order)) + if (IS_ENABLED(CONFIG_COMPACTION) && order && + !compaction_suitable(zone, order)) return false; return true; @@ -2684,7 +2685,7 @@ loop_again: * Do not reclaim more than needed for compaction. */ testorder = order; - if (COMPACTION_BUILD && order && + if (IS_ENABLED(CONFIG_COMPACTION) && order && compaction_suitable(zone, order) != COMPACT_SKIPPED) testorder = 0; -- cgit v1.1 From 344aa35c27acdf70d3c67b5aa7cb6aa8585f80c1 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 11 Dec 2012 16:00:34 -0800 Subject: thp: clean up __collapse_huge_page_isolate There are duplicated places using release_pte_pages(). And release_all_pte_pages() can be removed. Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Minchan Kim Cc: Ni zhan Chen Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 40f17c3..6f022f5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1701,64 +1701,49 @@ static void release_pte_pages(pte_t *pte, pte_t *_pte) } } -static void release_all_pte_pages(pte_t *pte) -{ - release_pte_pages(pte, pte + HPAGE_PMD_NR); -} - static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte) { struct page *page; pte_t *_pte; - int referenced = 0, isolated = 0, none = 0; + int referenced = 0, none = 0; for (_pte = pte; _pte < pte+HPAGE_PMD_NR; _pte++, address += PAGE_SIZE) { pte_t pteval = *_pte; if (pte_none(pteval)) { if (++none <= khugepaged_max_ptes_none) continue; - else { - release_pte_pages(pte, _pte); + else goto out; - } } - if (!pte_present(pteval) || !pte_write(pteval)) { - release_pte_pages(pte, _pte); + if (!pte_present(pteval) || !pte_write(pteval)) goto out; - } page = vm_normal_page(vma, address, pteval); - if (unlikely(!page)) { - release_pte_pages(pte, _pte); + if (unlikely(!page)) goto out; - } + VM_BUG_ON(PageCompound(page)); BUG_ON(!PageAnon(page)); VM_BUG_ON(!PageSwapBacked(page)); /* cannot use mapcount: can't collapse if there's a gup pin */ - if (page_count(page) != 1) { - release_pte_pages(pte, _pte); + if (page_count(page) != 1) goto out; - } /* * We can do it before isolate_lru_page because the * page can't be freed from under us. NOTE: PG_lock * is needed to serialize against split_huge_page * when invoked from the VM. */ - if (!trylock_page(page)) { - release_pte_pages(pte, _pte); + if (!trylock_page(page)) goto out; - } /* * Isolate the page to avoid collapsing an hugepage * currently in use by the VM. */ if (isolate_lru_page(page)) { unlock_page(page); - release_pte_pages(pte, _pte); goto out; } /* 0 stands for page_is_file_cache(page) == false */ @@ -1771,12 +1756,11 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, mmu_notifier_test_young(vma->vm_mm, address)) referenced = 1; } - if (unlikely(!referenced)) - release_all_pte_pages(pte); - else - isolated = 1; + if (likely(referenced)) + return 1; out: - return isolated; + release_pte_pages(pte, _pte); + return 0; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, -- cgit v1.1 From 6219049ae1ce32b89236646cccaec2a5fc6c4fd2 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 11 Dec 2012 16:00:37 -0800 Subject: mm: introduce mm_find_pmd() Several place need to find the pmd by(mm_struct, address), so introduce a function to simplify it. [akpm@linux-foundation.org: fix warning] Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Minchan Kim Cc: Ni zhan Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 55 ++++++++++--------------------------------------------- mm/internal.h | 5 +++++ mm/ksm.c | 14 ++------------ mm/migrate.c | 14 ++------------ mm/rmap.c | 48 +++++++++++++++++++++++++----------------------- 5 files changed, 44 insertions(+), 92 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 6f022f5..9ae9724 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1146,22 +1146,14 @@ pmd_t *page_check_address_pmd(struct page *page, unsigned long address, enum page_check_address_pmd_flag flag) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd, *ret = NULL; if (address & ~HPAGE_PMD_MASK) goto out; - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); if (pmd_none(*pmd)) goto out; if (pmd_page(*pmd) != page) @@ -1908,8 +1900,6 @@ static void collapse_huge_page(struct mm_struct *mm, struct vm_area_struct *vma, int node) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; @@ -1955,17 +1945,10 @@ static void collapse_huge_page(struct mm_struct *mm, goto out; VM_BUG_ON(vma->vm_flags & VM_NO_THP); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - /* pmd can't go away or become huge under us */ - if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_trans_huge(*pmd)) goto out; anon_vma_lock(vma->anon_vma); @@ -2048,8 +2031,6 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, unsigned long address, struct page **hpage) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte, *_pte; int ret = 0, referenced = 0, none = 0; @@ -2060,16 +2041,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) goto out; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd) || pmd_trans_huge(*pmd)) + if (pmd_trans_huge(*pmd)) goto out; pte = pte_offset_map_lock(mm, pmd, address, &ptl); @@ -2363,22 +2338,12 @@ void __split_huge_page_pmd(struct mm_struct *mm, pmd_t *pmd) static void split_huge_page_address(struct mm_struct *mm, unsigned long address) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; VM_BUG_ON(!(address & ~HPAGE_PMD_MASK)); - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return; /* * Caller holds the mmap_sem write mode, so a huge pmd cannot diff --git a/mm/internal.h b/mm/internal.h index a4fa284..52d1fa9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -92,6 +92,11 @@ extern int isolate_lru_page(struct page *page); extern void putback_lru_page(struct page *page); /* + * in mm/rmap.c: + */ +extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); + +/* * in mm/page_alloc.c */ extern void __free_pages_bootmem(struct page *page, unsigned int order); diff --git a/mm/ksm.c b/mm/ksm.c index ae539f0..31ae5ea 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -778,8 +778,6 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *ptep; spinlock_t *ptl; @@ -792,18 +790,10 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, if (addr == -EFAULT) goto out; - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, addr); + if (!pmd) goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); BUG_ON(pmd_trans_huge(*pmd)); - if (!pmd_present(*pmd)) - goto out; mmun_start = addr; mmun_end = addr + PAGE_SIZE; diff --git a/mm/migrate.c b/mm/migrate.c index 77ed2d7..1dc4598 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -91,8 +91,6 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, { struct mm_struct *mm = vma->vm_mm; swp_entry_t entry; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *ptep, pte; spinlock_t *ptl; @@ -103,19 +101,11 @@ static int remove_migration_pte(struct page *new, struct vm_area_struct *vma, goto out; ptl = &mm->page_table_lock; } else { - pgd = pgd_offset(mm, addr); - if (!pgd_present(*pgd)) + pmd = mm_find_pmd(mm, addr); + if (!pmd) goto out; - - pud = pud_offset(pgd, addr); - if (!pud_present(*pud)) - goto out; - - pmd = pmd_offset(pud, addr); if (pmd_trans_huge(*pmd)) goto out; - if (!pmd_present(*pmd)) - goto out; ptep = pte_offset_map(pmd, addr); diff --git a/mm/rmap.c b/mm/rmap.c index 2ee1ef0..46823fb 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -562,6 +562,27 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return address; } +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) +{ + pgd_t *pgd; + pud_t *pud; + pmd_t *pmd = NULL; + + pgd = pgd_offset(mm, address); + if (!pgd_present(*pgd)) + goto out; + + pud = pud_offset(pgd, address); + if (!pud_present(*pud)) + goto out; + + pmd = pmd_offset(pud, address); + if (!pmd_present(*pmd)) + pmd = NULL; +out: + return pmd; +} + /* * Check that @page is mapped at @address into @mm. * @@ -574,8 +595,6 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) pte_t *__page_check_address(struct page *page, struct mm_struct *mm, unsigned long address, spinlock_t **ptlp, int sync) { - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte; spinlock_t *ptl; @@ -586,17 +605,10 @@ pte_t *__page_check_address(struct page *page, struct mm_struct *mm, goto check; } - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return NULL; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return NULL; - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) - return NULL; if (pmd_trans_huge(*pmd)) return NULL; @@ -1345,8 +1357,6 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, struct vm_area_struct *vma, struct page *check_page) { struct mm_struct *mm = vma->vm_mm; - pgd_t *pgd; - pud_t *pud; pmd_t *pmd; pte_t *pte; pte_t pteval; @@ -1366,16 +1376,8 @@ static int try_to_unmap_cluster(unsigned long cursor, unsigned int *mapcount, if (end > vma->vm_end) end = vma->vm_end; - pgd = pgd_offset(mm, address); - if (!pgd_present(*pgd)) - return ret; - - pud = pud_offset(pgd, address); - if (!pud_present(*pud)) - return ret; - - pmd = pmd_offset(pud, address); - if (!pmd_present(*pmd)) + pmd = mm_find_pmd(mm, address); + if (!pmd) return ret; mmun_start = address; -- cgit v1.1 From fa475e517adb422cb3492e636195f9b2c0d009c8 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 11 Dec 2012 16:00:39 -0800 Subject: thp: introduce hugepage_vma_check() Multiple places do the same check. Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Minchan Kim Cc: Ni zhan Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 38 +++++++++++++++++--------------------- 1 file changed, 17 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 9ae9724..2600268 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1894,6 +1894,20 @@ static struct page } #endif +static bool hugepage_vma_check(struct vm_area_struct *vma) +{ + if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || + (vma->vm_flags & VM_NOHUGEPAGE)) + return false; + + if (!vma->anon_vma || vma->vm_ops) + return false; + if (is_vma_temporary_stack(vma)) + return false; + VM_BUG_ON(vma->vm_flags & VM_NO_THP); + return true; +} + static void collapse_huge_page(struct mm_struct *mm, unsigned long address, struct page **hpage, @@ -1934,17 +1948,8 @@ static void collapse_huge_page(struct mm_struct *mm, hend = vma->vm_end & HPAGE_PMD_MASK; if (address < hstart || address + HPAGE_PMD_SIZE > hend) goto out; - - if ((!(vma->vm_flags & VM_HUGEPAGE) && !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) - goto out; - - if (!vma->anon_vma || vma->vm_ops) - goto out; - if (is_vma_temporary_stack(vma)) + if (!hugepage_vma_check(vma)) goto out; - VM_BUG_ON(vma->vm_flags & VM_NO_THP); - pmd = mm_find_pmd(mm, address); if (!pmd) goto out; @@ -2152,20 +2157,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, progress++; break; } - - if ((!(vma->vm_flags & VM_HUGEPAGE) && - !khugepaged_always()) || - (vma->vm_flags & VM_NOHUGEPAGE)) { - skip: + if (!hugepage_vma_check(vma)) { +skip: progress++; continue; } - if (!vma->anon_vma || vma->vm_ops) - goto skip; - if (is_vma_temporary_stack(vma)) - goto skip; - VM_BUG_ON(vma->vm_flags & VM_NO_THP); - hstart = (vma->vm_start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; hend = vma->vm_end & HPAGE_PMD_MASK; if (hstart >= hend) -- cgit v1.1 From b3092b3b734f146d96ca023a75cacf78078f96d5 Mon Sep 17 00:00:00 2001 From: Bob Liu Date: Tue, 11 Dec 2012 16:00:41 -0800 Subject: thp: cleanup: introduce mk_huge_pmd() Introduce mk_huge_pmd() to simplify the code Signed-off-by: Bob Liu Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Minchan Kim Cc: Ni zhan Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2600268..ea5fb93 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -606,6 +606,15 @@ static inline pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) return pmd; } +static inline pmd_t mk_huge_pmd(struct page *page, struct vm_area_struct *vma) +{ + pmd_t entry; + entry = mk_pmd(page, vma->vm_page_prot); + entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); + entry = pmd_mkhuge(entry); + return entry; +} + static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long haddr, pmd_t *pmd, @@ -629,9 +638,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pte_free(mm, pgtable); } else { pmd_t entry; - entry = mk_pmd(page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); + entry = mk_huge_pmd(page, vma); /* * The spinlocking to take the lru_lock inside * page_add_new_anon_rmap() acts as a full memory @@ -951,9 +958,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, } else { pmd_t entry; VM_BUG_ON(!PageHead(page)); - entry = mk_pmd(new_page, vma->vm_page_prot); - entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - entry = pmd_mkhuge(entry); + entry = mk_huge_pmd(new_page, vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); set_pmd_at(mm, haddr, pmd, entry); @@ -2000,9 +2005,7 @@ static void collapse_huge_page(struct mm_struct *mm, __SetPageUptodate(new_page); pgtable = pmd_pgtable(_pmd); - _pmd = mk_pmd(new_page, vma->vm_page_prot); - _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); - _pmd = pmd_mkhuge(_pmd); + _pmd = mk_huge_pmd(new_page, vma); /* * spin_lock() below is not the equivalent of smp_wmb(), so -- cgit v1.1 From b023f46813cde6e3b8a8c24f432ff9c1fd8e9a64 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:45 -0800 Subject: memory-hotplug: skip HWPoisoned page when offlining pages hwpoisoned may be set when we offline a page by the sysfs interface /sys/devices/system/memory/soft_offline_page or /sys/devices/system/memory/hard_offline_page. If we don't clear this flag when onlining pages, this page can't be freed, and will not in free list. So we can't offline these pages again. So we should skip such page when offlining pages. Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Andi Kleen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 2 +- mm/memory_hotplug.c | 5 +++-- mm/page_alloc.c | 27 +++++++++++++++++++++++---- mm/page_isolation.c | 27 ++++++++++++++++++++------- 4 files changed, 47 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8b20278..2c9fc73 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1385,7 +1385,7 @@ static int get_any_page(struct page *p, unsigned long pfn, int flags) * Isolate the page, so that it doesn't get reallocated if it * was free. */ - set_migratetype_isolate(p); + set_migratetype_isolate(p, true); /* * When the target page is a free hugepage, just remove it * from free hugepage list. diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index e4eeaca..0095d15 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -847,7 +847,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages, { int ret; long offlined = *(long *)data; - ret = test_pages_isolated(start_pfn, start_pfn + nr_pages); + ret = test_pages_isolated(start_pfn, start_pfn + nr_pages, true); offlined = nr_pages; if (!ret) *(long *)data += offlined; @@ -894,7 +894,8 @@ static int __ref __offline_pages(unsigned long start_pfn, nr_pages = end_pfn - start_pfn; /* set above range as isolated */ - ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); + ret = start_isolate_page_range(start_pfn, end_pfn, + MIGRATE_MOVABLE, true); if (ret) goto out; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a49b0ea..6f50cfe 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5616,7 +5616,8 @@ void set_pageblock_flags_group(struct page *page, unsigned long flags, * MIGRATE_MOVABLE block might include unmovable pages. It means you can't * expect this function should be exact. */ -bool has_unmovable_pages(struct zone *zone, struct page *page, int count) +bool has_unmovable_pages(struct zone *zone, struct page *page, int count, + bool skip_hwpoisoned_pages) { unsigned long pfn, iter, found; int mt; @@ -5651,6 +5652,13 @@ bool has_unmovable_pages(struct zone *zone, struct page *page, int count) continue; } + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if (skip_hwpoisoned_pages && PageHWPoison(page)) + continue; + if (!PageLRU(page)) found++; /* @@ -5693,7 +5701,7 @@ bool is_pageblock_removable_nolock(struct page *page) zone->zone_start_pfn + zone->spanned_pages <= pfn) return false; - return !has_unmovable_pages(zone, page, 0); + return !has_unmovable_pages(zone, page, 0, true); } #ifdef CONFIG_CMA @@ -5864,7 +5872,8 @@ int alloc_contig_range(unsigned long start, unsigned long end, */ ret = start_isolate_page_range(pfn_max_align_down(start), - pfn_max_align_up(end), migratetype); + pfn_max_align_up(end), migratetype, + false); if (ret) return ret; @@ -5903,7 +5912,7 @@ int alloc_contig_range(unsigned long start, unsigned long end, } /* Make sure the range is really isolated. */ - if (test_pages_isolated(outer_start, end)) { + if (test_pages_isolated(outer_start, end, false)) { pr_warn("alloc_contig_range test_pages_isolated(%lx, %lx) failed\n", outer_start, end); ret = -EBUSY; @@ -6018,6 +6027,16 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) continue; } page = pfn_to_page(pfn); + /* + * The HWPoisoned page may be not in buddy system, and + * page_count() is not 0. + */ + if (unlikely(!PageBuddy(page) && PageHWPoison(page))) { + pfn++; + SetPageReserved(page); + continue; + } + BUG_ON(page_count(page)); BUG_ON(!PageBuddy(page)); order = page_order(page); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f2f5b48..9d2264e 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -30,7 +30,7 @@ static void restore_pageblock_isolate(struct page *page, int migratetype) zone->nr_pageblock_isolate--; } -int set_migratetype_isolate(struct page *page) +int set_migratetype_isolate(struct page *page, bool skip_hwpoisoned_pages) { struct zone *zone; unsigned long flags, pfn; @@ -66,7 +66,8 @@ int set_migratetype_isolate(struct page *page) * FIXME: Now, memory hotplug doesn't call shrink_slab() by itself. * We just check MOVABLE pages. */ - if (!has_unmovable_pages(zone, page, arg.pages_found)) + if (!has_unmovable_pages(zone, page, arg.pages_found, + skip_hwpoisoned_pages)) ret = 0; /* @@ -134,7 +135,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages) * Returns 0 on success and -EBUSY if any part of range cannot be isolated. */ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, - unsigned migratetype) + unsigned migratetype, bool skip_hwpoisoned_pages) { unsigned long pfn; unsigned long undo_pfn; @@ -147,7 +148,8 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, pfn < end_pfn; pfn += pageblock_nr_pages) { page = __first_valid_page(pfn, pageblock_nr_pages); - if (page && set_migratetype_isolate(page)) { + if (page && + set_migratetype_isolate(page, skip_hwpoisoned_pages)) { undo_pfn = pfn; goto undo; } @@ -190,7 +192,8 @@ int undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, * Returns 1 if all pages in the range are isolated. */ static int -__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) +__test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn, + bool skip_hwpoisoned_pages) { struct page *page; @@ -220,6 +223,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) else if (page_count(page) == 0 && get_freepage_migratetype(page) == MIGRATE_ISOLATE) pfn += 1; + else if (skip_hwpoisoned_pages && PageHWPoison(page)) { + /* + * The HWPoisoned page may be not in buddy + * system, and page_count() is not 0. + */ + pfn++; + continue; + } else break; } @@ -228,7 +239,8 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn) return 1; } -int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) +int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn, + bool skip_hwpoisoned_pages) { unsigned long pfn, flags; struct page *page; @@ -251,7 +263,8 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) /* Check all pages are free or Marked as ISOLATED */ zone = page_zone(page); spin_lock_irqsave(&zone->lock, flags); - ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn); + ret = __test_page_isolated_in_pageblock(start_pfn, end_pfn, + skip_hwpoisoned_pages); spin_unlock_irqrestore(&zone->lock, flags); return ret ? 0 : -EBUSY; } -- cgit v1.1 From 95a4774d055c72d96ab192a1c6675cbf4d513f71 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:47 -0800 Subject: memory-hotplug: update mce_bad_pages when removing the memory When we hotremove a memory device, we will free the memory to store struct page. If the page is hwpoisoned page, we should decrease mce_bad_pages. [akpm@linux-foundation.org: cleanup ifdefs] Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Dave Hansen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index a83de2f..c7be019 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -771,6 +771,27 @@ out: return ret; } +#ifdef CONFIG_MEMORY_FAILURE +static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ + int i; + + if (!memmap) + return; + + for (i = 0; i < PAGES_PER_SECTION; i++) { + if (PageHWPoison(&memmap[i])) { + atomic_long_sub(1, &mce_bad_pages); + ClearPageHWPoison(&memmap[i]); + } + } +} +#else +static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) +{ +} +#endif + void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) { struct page *memmap = NULL; @@ -784,6 +805,7 @@ void sparse_remove_one_section(struct zone *zone, struct mem_section *ms) ms->pageblock_flags = NULL; } + clear_hwpoisoned_pages(memmap, PAGES_PER_SECTION); free_section_usemap(memmap, usemap); } #endif -- cgit v1.1 From 7c72eb327282ee7fcadc5ef227c075cf72467ba7 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:49 -0800 Subject: memory-hotplug: auto offline page_cgroup when onlining memory block failed When a memory block is onlined, we will try allocate memory on that node to store page_cgroup. If onlining the memory block failed, we don't offline the page cgroup, and we have no chance to offline this page cgroup unless the memory block is onlined successfully again. It will cause that we can't hot-remove the memory device on that node, because some memory is used to store page cgroup. If onlining the memory block is failed, there is no need to stort page cgroup for this memory. So auto offline page_cgroup when onlining memory block failed. Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Dave Hansen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_cgroup.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'mm') diff --git a/mm/page_cgroup.c b/mm/page_cgroup.c index 5ddad0c..44db00e 100644 --- a/mm/page_cgroup.c +++ b/mm/page_cgroup.c @@ -251,6 +251,9 @@ static int __meminit page_cgroup_callback(struct notifier_block *self, mn->nr_pages, mn->status_change_nid); break; case MEM_CANCEL_ONLINE: + offline_page_cgroup(mn->start_pfn, + mn->nr_pages, mn->status_change_nid); + break; case MEM_GOING_OFFLINE: break; case MEM_ONLINE: -- cgit v1.1 From 97d0da2204ed9e34d9d42c2024c5bea5543f13c6 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:52 -0800 Subject: memory-hotplug: fix NR_FREE_PAGES mismatch NR_FREE_PAGES will be wrong after offlining pages. We add/dec NR_FREE_PAGES like this now: 1. move all pages in buddy system to MIGRATE_ISOLATE, and dec NR_FREE_PAGES 2. don't add NR_FREE_PAGES when it is freed and the migratetype is MIGRATE_ISOLATE 3. dec NR_FREE_PAGES when offlining isolated pages. 4. add NR_FREE_PAGES when undoing isolate pages. When we come to step 3, all pages are in MIGRATE_ISOLATE list, and NR_FREE_PAGES are right. When we come to step4, all pages are not in buddy system, so we don't change NR_FREE_PAGES in this step, but we change NR_FREE_PAGES in step3. So NR_FREE_PAGES is wrong after offlining pages. So there is no need to change NR_FREE_PAGES in step3. This patch also fixs a problem in step2: if the migratetype is MIGRATE_ISOLATE, we should not add NR_FRR_PAGES when we remove pages from pcppages. Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Dave Hansen Cc: Mel Gorman Cc: Jianguo Wu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6f50cfe..4dba04f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -667,11 +667,13 @@ static void free_pcppages_bulk(struct zone *zone, int count, /* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */ __free_one_page(page, zone, 0, mt); trace_mm_page_pcpu_drain(page, 0, mt); - if (is_migrate_cma(mt)) - __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); + if (likely(get_pageblock_migratetype(page) != MIGRATE_ISOLATE)) { + __mod_zone_page_state(zone, NR_FREE_PAGES, 1); + if (is_migrate_cma(mt)) + __mod_zone_page_state(zone, NR_FREE_CMA_PAGES, 1); + } } while (--to_free && --batch_free && !list_empty(list)); } - __mod_zone_page_state(zone, NR_FREE_PAGES, count); spin_unlock(&zone->lock); } @@ -6047,8 +6049,6 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; - __mod_zone_page_state(zone, NR_FREE_PAGES, - - (1UL << order)); for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); -- cgit v1.1 From 8732794b166196cc501c2ddd9e7c97cf45ab64c5 Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:56 -0800 Subject: numa: convert static memory to dynamically allocated memory for per node device We use a static array to store struct node. In many cases, we don't have too many nodes, and some memory will be unused. Convert it to per-device dynamically allocated memory. Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 59a0059..1ef2cd4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1800,7 +1800,7 @@ static void hugetlb_unregister_all_nodes(void) * remove hstate attributes from any nodes that have them. */ for (nid = 0; nid < nr_node_ids; nid++) - hugetlb_unregister_node(&node_devices[nid]); + hugetlb_unregister_node(node_devices[nid]); } /* @@ -1845,7 +1845,7 @@ static void hugetlb_register_all_nodes(void) int nid; for_each_node_state(nid, N_HIGH_MEMORY) { - struct node *node = &node_devices[nid]; + struct node *node = node_devices[nid]; if (node->dev.id == nid) hugetlb_register_node(node); } -- cgit v1.1 From 3ac19f8efe26451cacac31d0be34fa9c51114c2a Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:00:59 -0800 Subject: memory-hotplug, mm/sparse.c: clear the memory to store struct page If sparse memory vmemmap is enabled, we can't free the memory to store struct page when a memory device is hotremoved, because we may store struct page in the memory to manage the memory which doesn't belong to this memory device. When we hotadded this memory device again, we will reuse this memory to store struct page, and struct page may contain some obsolete information, and we will get bad-page state: init_memory_mapping: [mem 0x80000000-0x9fffffff] Built 2 zonelists in Node order, mobility grouping on. Total pages: 547617 Policy zone: Normal BUG: Bad page state in process bash pfn:9b6dc page:ffffea0002200020 count:0 mapcount:0 mapping: (null) index:0xfdfdfdfdfdfdfdfd page flags: 0x2fdfdfdfd5df9fd(locked|referenced|uptodate|dirty|lru|active|slab|owner_priv_1|private|private_2|writeback|head|tail|swapcache|reclaim|swapbacked|unevictable|uncached|compound_lock) Modules linked in: netconsole acpiphp pci_hotplug acpi_memhotplug loop kvm_amd kvm microcode tpm_tis tpm tpm_bios evdev psmouse serio_raw i2c_piix4 i2c_core parport_pc parport processor button thermal_sys ext3 jbd mbcache sg sr_mod cdrom ata_generic virtio_net ata_piix virtio_blk libata virtio_pci virtio_ring virtio scsi_mod Pid: 988, comm: bash Not tainted 3.6.0-rc7-guest #12 Call Trace: [] ? bad_page+0xb0/0x100 [] ? free_pages_prepare+0xb3/0x100 [] ? free_hot_cold_page+0x48/0x1a0 [] ? online_pages_range+0x68/0xa0 [] ? __online_page_increment_counters+0x10/0x10 [] ? walk_system_ram_range+0x101/0x110 [] ? online_pages+0x1a5/0x2b0 [] ? __memory_block_change_state+0x20d/0x270 [] ? store_mem_state+0xb6/0xf0 [] ? sysfs_write_file+0xd2/0x160 [] ? vfs_write+0xaa/0x160 [] ? sys_write+0x47/0x90 [] ? async_page_fault+0x25/0x30 [] ? system_call_fastpath+0x16/0x1b Disabling lock debugging due to kernel taint This patch clears the memory to store struct page to avoid unexpected error. Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Minchan Kim Acked-by: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Reported-by: Vasilis Liaskovitis Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/sparse.c b/mm/sparse.c index c7be019..6b5fb76 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -638,7 +638,6 @@ static struct page *__kmalloc_section_memmap(unsigned long nr_pages) got_map_page: ret = (struct page *)pfn_to_kaddr(page_to_pfn(page)); got_map_ptr: - memset(ret, 0, memmap_size); return ret; } @@ -758,6 +757,8 @@ int __meminit sparse_add_one_section(struct zone *zone, unsigned long start_pfn, goto out; } + memset(memmap, 0, sizeof(struct page) * nr_pages); + ms->section_mem_map |= SECTION_MARKED_PRESENT; ret = sparse_init_one_section(ms, section_nr, memmap, usemap); -- cgit v1.1 From 6dcd73d7011ba9046f9b98e7f7c9d958f5810e6b Mon Sep 17 00:00:00 2001 From: Wen Congyang Date: Tue, 11 Dec 2012 16:01:01 -0800 Subject: memory-hotplug: allocate zone's pcp before onlining pages We use __free_page() to put a page to buddy system when onlining pages. __free_page() will store NR_FREE_PAGES in zone's pcp.vm_stat_diff, so we should allocate zone's pcp before onlining pages, otherwise we will lose some free pages. [mhocko@suse.cz: make zone_pcp_reset independent of MEMORY_HOTREMOVE] Signed-off-by: Wen Congyang Cc: David Rientjes Cc: Jiang Liu Cc: Len Brown Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christoph Lameter Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Dave Hansen Cc: Mel Gorman Signed-off-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 8 ++++++-- mm/page_alloc.c | 2 +- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 0095d15..ec2f199 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -498,12 +498,16 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) * So, zonelist must be updated after online. */ mutex_lock(&zonelists_mutex); - if (!populated_zone(zone)) + if (!populated_zone(zone)) { need_zonelists_rebuild = 1; + build_all_zonelists(NULL, zone); + } ret = walk_system_ram_range(pfn, nr_pages, &onlined_pages, online_pages_range); if (ret) { + if (need_zonelists_rebuild) + zone_pcp_reset(zone); mutex_unlock(&zonelists_mutex); printk(KERN_DEBUG "online_pages [mem %#010llx-%#010llx] failed\n", (unsigned long long) pfn << PAGE_SHIFT, @@ -519,7 +523,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) if (onlined_pages) { node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); if (need_zonelists_rebuild) - build_all_zonelists(NULL, zone); + build_all_zonelists(NULL, NULL); else zone_pcp_update(zone); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4dba04f0..5a7b761 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5983,7 +5983,6 @@ void __meminit zone_pcp_update(struct zone *zone) } #endif -#ifdef CONFIG_MEMORY_HOTREMOVE void zone_pcp_reset(struct zone *zone) { unsigned long flags; @@ -6003,6 +6002,7 @@ void zone_pcp_reset(struct zone *zone) local_irq_restore(flags); } +#ifdef CONFIG_MEMORY_HOTREMOVE /* * All pages in the range must be isolated before calling this. */ -- cgit v1.1 From d9713679dbd2a6ecb840cd5b65a3ec555c1ec3d4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:01:03 -0800 Subject: memory_hotplug: fix possible incorrect node_states[N_NORMAL_MEMORY] Currently memory_hotplug only manages the node_states[N_HIGH_MEMORY], it forgets to manage node_states[N_NORMAL_MEMORY]. This may cause node_states[N_NORMAL_MEMORY] to become incorrect. Example, if a node is empty before online, and we online a memory which is in ZONE_NORMAL. And after online, node_states[N_HIGH_MEMORY] is correct, but node_states[N_NORMAL_MEMORY] is incorrect, the online code doesn't set the new online node to node_states[N_NORMAL_MEMORY]. The same thing will happen when offlining (the offline code doesn't clear the node from node_states[N_NORMAL_MEMORY] when needed). Some memory managment code depends node_states[N_NORMAL_MEMORY], so we have to fix up the node_states[N_NORMAL_MEMORY]. We add node_states_check_changes_online() and node_states_check_changes_offline() to detect whether node_states[N_HIGH_MEMORY] and node_states[N_NORMAL_MEMORY] are changed while hotpluging. Also add @status_change_nid_normal to struct memory_notify, thus the memory hotplug callbacks know whether the node_states[N_NORMAL_MEMORY] are changed. (We can add a @flags and reuse @status_change_nid instead of introducing @status_change_nid_normal, but it will add much more complexity in memory hotplug callback in every subsystem. So introducing @status_change_nid_normal is better and it doesn't change the sematics of @status_change_nid) Signed-off-by: Lai Jiangshan Cc: David Rientjes Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Rob Landley Cc: Jiang Liu Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: Mel Gorman Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 136 +++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 120 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index ec2f199..7219560 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -460,6 +460,53 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, return 0; } +/* check which state of node_states will be changed when online memory */ +static void node_states_check_changes_online(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + int nid = zone_to_nid(zone); + enum zone_type zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_HIGH_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * if the memory to be online is in a zone of 0...zone_last, and + * the zones of 0...zone_last don't have memory before online, we will + * need to set the node to node_states[N_NORMAL_MEMORY] after + * the memory is online. + */ + if (zone_idx(zone) <= zone_last && !node_state(nid, N_NORMAL_MEMORY)) + arg->status_change_nid_normal = nid; + else + arg->status_change_nid_normal = -1; + + /* + * if the node don't have memory befor online, we will need to + * set the node to node_states[N_HIGH_MEMORY] after the memory + * is online. + */ + if (!node_state(nid, N_HIGH_MEMORY)) + arg->status_change_nid = nid; + else + arg->status_change_nid = -1; +} + +static void node_states_set_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_set_state(node, N_NORMAL_MEMORY); + + node_set_state(node, N_HIGH_MEMORY); +} + int __ref online_pages(unsigned long pfn, unsigned long nr_pages) { @@ -471,13 +518,18 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) struct memory_notify arg; lock_memory_hotplug(); + /* + * This doesn't need a lock to do pfn_to_page(). + * The section can't be removed here because of the + * memory_block->state_mutex. + */ + zone = page_zone(pfn_to_page(pfn)); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; - arg.status_change_nid = -1; + node_states_check_changes_online(nr_pages, zone, &arg); nid = page_to_nid(pfn_to_page(pfn)); - if (node_present_pages(nid) == 0) - arg.status_change_nid = nid; ret = memory_notify(MEM_GOING_ONLINE, &arg); ret = notifier_to_errno(ret); @@ -487,12 +539,6 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) return ret; } /* - * This doesn't need a lock to do pfn_to_page(). - * The section can't be removed here because of the - * memory_block->state_mutex. - */ - zone = page_zone(pfn_to_page(pfn)); - /* * If this zone is not populated, then it is not in zonelist. * This means the page allocator ignores this zone. * So, zonelist must be updated after online. @@ -521,7 +567,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) zone->present_pages += onlined_pages; zone->zone_pgdat->node_present_pages += onlined_pages; if (onlined_pages) { - node_set_state(zone_to_nid(zone), N_HIGH_MEMORY); + node_states_set_node(zone_to_nid(zone), &arg); if (need_zonelists_rebuild) build_all_zonelists(NULL, NULL); else @@ -871,6 +917,67 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } +/* check which state of node_states will be changed when offline memory */ +static void node_states_check_changes_offline(unsigned long nr_pages, + struct zone *zone, struct memory_notify *arg) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt, zone_last = ZONE_NORMAL; + + /* + * If we have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_NORMAL, set zone_last to ZONE_NORMAL. + * + * If we don't have HIGHMEM, node_states[N_NORMAL_MEMORY] contains nodes + * which have 0...ZONE_MOVABLE, set zone_last to ZONE_MOVABLE. + */ + if (N_HIGH_MEMORY == N_NORMAL_MEMORY) + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_NORMAL_MEMORY] will be changed. + * If the memory to be offline is in a zone of 0...zone_last, + * and it is the last present memory, 0...zone_last will + * become empty after offline , thus we can determind we will + * need to clear the node from node_states[N_NORMAL_MEMORY]. + */ + for (zt = 0; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (zone_idx(zone) <= zone_last && nr_pages >= present_pages) + arg->status_change_nid_normal = zone_to_nid(zone); + else + arg->status_change_nid_normal = -1; + + /* + * node_states[N_HIGH_MEMORY] contains nodes which have 0...ZONE_MOVABLE + */ + zone_last = ZONE_MOVABLE; + + /* + * check whether node_states[N_HIGH_MEMORY] will be changed + * If we try to offline the last present @nr_pages from the node, + * we can determind we will need to clear the node from + * node_states[N_HIGH_MEMORY]. + */ + for (; zt <= zone_last; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + if (nr_pages >= present_pages) + arg->status_change_nid = zone_to_nid(zone); + else + arg->status_change_nid = -1; +} + +static void node_states_clear_node(int node, struct memory_notify *arg) +{ + if (arg->status_change_nid_normal >= 0) + node_clear_state(node, N_NORMAL_MEMORY); + + if ((N_HIGH_MEMORY != N_NORMAL_MEMORY) && + (arg->status_change_nid >= 0)) + node_clear_state(node, N_HIGH_MEMORY); +} + static int __ref __offline_pages(unsigned long start_pfn, unsigned long end_pfn, unsigned long timeout) { @@ -905,9 +1012,7 @@ static int __ref __offline_pages(unsigned long start_pfn, arg.start_pfn = start_pfn; arg.nr_pages = nr_pages; - arg.status_change_nid = -1; - if (nr_pages >= node_present_pages(node)) - arg.status_change_nid = node; + node_states_check_changes_offline(nr_pages, zone, &arg); ret = memory_notify(MEM_GOING_OFFLINE, &arg); ret = notifier_to_errno(ret); @@ -980,10 +1085,9 @@ repeat: } else zone_pcp_update(zone); - if (!node_present_pages(node)) { - node_clear_state(node, N_HIGH_MEMORY); + node_states_clear_node(node, &arg); + if (arg.status_change_nid >= 0) kswapd_stop(node); - } vm_total_pages = nr_free_pagecache_pages(); writeback_set_ratelimit(); -- cgit v1.1 From b9d5ab2562eceeada5e4837a621b6260574dd11d Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:01:05 -0800 Subject: slub, hotplug: ignore unrelated node's hot-adding and hot-removing SLUB only focuses on the nodes which have normal memory and it ignores the other node's hot-adding and hot-removing. Aka: if some memory of a node which has no onlined memory is online, but this new memory onlined is not normal memory (for example, highmem), we should not allocate kmem_cache_node for SLUB. And if the last normal memory is offlined, but the node still has memory, we should remove kmem_cache_node for that node. (The current code delays it when all of the memory is offlined) So we only do something when marg->status_change_nid_normal > 0. marg->status_change_nid is not suitable here. The same problem doesn't exist in SLAB, because SLAB allocates kmem_list3 for every node even the node don't have normal memory, SLAB tolerates kmem_list3 on alien nodes. SLUB only focuses on the nodes which have normal memory, it don't tolerate alien kmem_cache_node. The patch makes SLUB become self-compatible and avoids WARNs and BUGs in rare conditions. Signed-off-by: Lai Jiangshan Cc: David Rientjes Cc: Minchan Kim Cc: KOSAKI Motohiro Cc: Yasuaki Ishimatsu Cc: Rob Landley Cc: Andrew Morton Cc: Jiang Liu Cc: Kay Sievers Cc: Greg Kroah-Hartman Cc: Mel Gorman Cc: Wen Congyang Acked-by: Christoph Lameter Cc: Pekka Enberg Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/slub.c b/mm/slub.c index a0d6984..487f0bd 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -3573,7 +3573,7 @@ static void slab_mem_offline_callback(void *arg) struct memory_notify *marg = arg; int offline_node; - offline_node = marg->status_change_nid; + offline_node = marg->status_change_nid_normal; /* * If the node still has available memory. we need kmem_cache_node @@ -3606,7 +3606,7 @@ static int slab_mem_going_online_callback(void *arg) struct kmem_cache_node *n; struct kmem_cache *s; struct memory_notify *marg = arg; - int nid = marg->status_change_nid; + int nid = marg->status_change_nid_normal; int ret = 0; /* -- cgit v1.1 From 712cd386fdc983d318fecf302a2a9cb8e9de90c9 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Tue, 11 Dec 2012 16:01:07 -0800 Subject: mm/memory_hotplug.c: update start_pfn in zone and pg_data when spanned_pages == 0. If we hot-remove memory only and leave the cpus alive, the corresponding node will not be removed. But the node_start_pfn and node_spanned_pages in pg_data will be reset to 0. In this case, when we hot-add the memory back next time, the node_start_pfn will always be 0 because no pfn is less than 0. After that, if we hot-remove the memory again, it will cause kernel panic in function find_biggest_section_pfn() when it tries to scan all the pfns. The zone will also have the same problem. This patch sets start_pfn to the start_pfn of the section being added when spanned_pages of the zone or pg_data is 0. ---How to reproduce--- 1. hot-add a container with some memory and cpus; 2. hot-remove the container's memory, and leave cpus there; 3. hot-add these memory again; 4. hot-remove them again; then, the kernel will panic. ---Call trace--- BUG: unable to handle kernel paging request at 00000fff82a8cc38 IP: [] find_biggest_section_pfn+0xe5/0x180 ...... Call Trace: [] __remove_zone+0x184/0x1b0 [] __remove_section+0x8c/0xb0 [] __remove_pages+0xe7/0x120 [] arch_remove_memory+0x2c/0x80 [] remove_memory+0x56/0x90 [] acpi_memory_device_remove_memory+0x48/0x73 [] acpi_memory_device_notify+0x153/0x274 [] acpi_ev_notify_dispatch+0x41/0x5f [] acpi_os_execute_deferred+0x27/0x34 [] process_one_work+0x219/0x680 [] worker_thread+0x12e/0x320 [] kthread+0xc6/0xd0 [] kernel_thread_helper+0x4/0x10 ...... ---[ end trace 96d845dbf33fee11 ]--- Signed-off-by: Tang Chen Cc: Yasuaki Ishimatsu Cc: Wen Congyang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 7219560..571130e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -205,7 +205,7 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writelock(zone); old_zone_end_pfn = zone->zone_start_pfn + zone->spanned_pages; - if (start_pfn < zone->zone_start_pfn) + if (!zone->spanned_pages || start_pfn < zone->zone_start_pfn) zone->zone_start_pfn = start_pfn; zone->spanned_pages = max(old_zone_end_pfn, end_pfn) - @@ -220,7 +220,7 @@ static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long old_pgdat_end_pfn = pgdat->node_start_pfn + pgdat->node_spanned_pages; - if (start_pfn < pgdat->node_start_pfn) + if (!pgdat->node_spanned_pages || start_pfn < pgdat->node_start_pfn) pgdat->node_start_pfn = start_pfn; pgdat->node_spanned_pages = max(old_pgdat_end_pfn, end_pfn) - -- cgit v1.1 From e749eb95531ac8349df47f8d46ce2641dcb16589 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 11 Dec 2012 16:01:09 -0800 Subject: mm: add comment on storage key dirty bit semantics Add comments that dirty bit in storage key gets set whenever page content is changed. Hopefully if someone will use this function, he'll have a look at one of the two places where we comment on this. Signed-off-by: Jan Kara Cc: Martin Schwidefsky Cc: Mel Gorman Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/rmap.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/rmap.c b/mm/rmap.c index 46823fb..cf7e99a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1151,9 +1151,11 @@ void page_remove_rmap(struct page *page) * containing the swap entry, but page not yet written to swap. * * And we can skip it on file pages, so long as the filesystem - * participates in dirty tracking; but need to catch shm and tmpfs - * and ramfs pages which have been modified since creation by read - * fault. + * participates in dirty tracking (note that this is not only an + * optimization but also solves problems caused by dirty flag in + * storage key getting set by a write from inside kernel); but need to + * catch shm and tmpfs and ramfs pages which have been modified since + * creation by read fault. * * Note that mapping must be decided above, before decrementing * mapcount (which luckily provides a barrier): once page is unmapped, -- cgit v1.1 From e9868505987a03a26a3979f27b82911ccc003752 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 11 Dec 2012 16:01:10 -0800 Subject: mm,vmscan: only evict file pages when we have plenty If we have more inactive file pages than active file pages, we skip scanning the active file pages altogether, with the idea that we do not want to evict the working set when there is plenty of streaming IO in the cache. However, the code forgot to also skip scanning anonymous pages in that situation. That leads to the curious situation of keeping the active file pages protected from being paged out when there are lots of inactive file pages, while still scanning and evicting anonymous pages. This patch fixes that situation, by only evicting file pages when we have plenty of them and most are inactive. [akpm@linux-foundation.org: adjust comment layout] Signed-off-by: Rik van Riel Cc: Mel Gorman Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index a1ce17f..5394731 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1679,13 +1679,24 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, if (global_reclaim(sc)) { free = zone_page_state(zone, NR_FREE_PAGES); - /* If we have very few page cache pages, - force-scan anon pages. */ if (unlikely(file + free <= high_wmark_pages(zone))) { + /* + * If we have very few page cache pages, force-scan + * anon pages. + */ fraction[0] = 1; fraction[1] = 0; denominator = 1; goto out; + } else if (!inactive_file_is_low_global(zone)) { + /* + * There is enough inactive page cache, do not + * reclaim anything from the working set right now. + */ + fraction[0] = 0; + fraction[1] = 1; + denominator = 1; + goto out; } } -- cgit v1.1 From cf0cac0a09341549dedabcfc2a66dcbc2eaaf2b9 Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 11 Dec 2012 16:01:13 -0800 Subject: mm: refactor reinsert of swap_info in sys_swapoff() The block within sys_swapoff() which re-inserts the swap_info into the swap_list in case of failure of try_to_unuse() reads a few values outside the swap_lock. While this is safe at that point, it is subtle code. Simplify the code by moving the reading of these values to a separate function, refactoring it a bit so they are read from within the swap_lock. This is easier to understand, and matches better the way it worked before I unified the insertion of the swap_info from both sys_swapon and sys_swapoff. This change should make no functional difference. The only real change is moving the read of two or three structure fields to within the lock (frontswap_map_get() is nothing more than a read of p->frontswap_map). Signed-off-by: Cesar Eduardo Barros Cc: Konrad Rzeszutek Wilk Cc: Dan Magenheimer Cc: Mel Gorman Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index f91a255..27a52b7 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1443,13 +1443,12 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } -static void enable_swap_info(struct swap_info_struct *p, int prio, +static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, unsigned long *frontswap_map) { int i, prev; - spin_lock(&swap_lock); if (prio >= 0) p->prio = prio; else @@ -1473,6 +1472,21 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, else swap_info[prev]->next = p->type; frontswap_init(p->type); +} + +static void enable_swap_info(struct swap_info_struct *p, int prio, + unsigned char *swap_map, + unsigned long *frontswap_map) +{ + spin_lock(&swap_lock); + _enable_swap_info(p, prio, swap_map, frontswap_map); + spin_unlock(&swap_lock); +} + +static void reinsert_swap_info(struct swap_info_struct *p) +{ + spin_lock(&swap_lock); + _enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); spin_unlock(&swap_lock); } @@ -1548,14 +1562,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); if (err) { - /* - * reading p->prio and p->swap_map outside the lock is - * safe here because only sys_swapon and sys_swapoff - * change them, and there can be no other sys_swapon or - * sys_swapoff for this swap_info_struct at this point. - */ /* re-insert swap space back into swap_list */ - enable_swap_info(p, p->prio, p->swap_map, frontswap_map_get(p)); + reinsert_swap_info(p); goto out_dput; } -- cgit v1.1 From 6555bc035731eab76c0901925034465d3ad2099c Mon Sep 17 00:00:00 2001 From: Cesar Eduardo Barros Date: Tue, 11 Dec 2012 16:01:14 -0800 Subject: mm: do not call frontswap_init() during swapoff The call to frontswap_init() was added within enable_swap_info(), which was called not only during sys_swapon, but also to reinsert the swap_info into the swap_list in case of failure of try_to_unuse() within sys_swapoff. This means that frontswap_init() might be called more than once for the same swap area. While as far as I could see no frontswap implementation has any problem with it (and in fact, all the ones I found ignore the parameter passed to frontswap_init), this could change in the future. To prevent future problems, move the call to frontswap_init() to outside the code shared between sys_swapon and sys_swapoff. Signed-off-by: Cesar Eduardo Barros Cc: Konrad Rzeszutek Wilk Acked-by: Dan Magenheimer Cc: Mel Gorman Cc: Rik van Riel Cc: KAMEZAWA Hiroyuki Cc: Johannes Weiner Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 27a52b7..0fbb452 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1471,7 +1471,6 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, swap_list.head = swap_list.next = p->type; else swap_info[prev]->next = p->type; - frontswap_init(p->type); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -1480,6 +1479,7 @@ static void enable_swap_info(struct swap_info_struct *p, int prio, { spin_lock(&swap_lock); _enable_swap_info(p, prio, swap_map, frontswap_map); + frontswap_init(p->type); spin_unlock(&swap_lock); } -- cgit v1.1 From 4de22c0584fb0566487b2cba5cdfbce346b18402 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:01:17 -0800 Subject: mm, highmem: use PKMAP_NR() to calculate an index of pkmap To calculate an index of pkmap, using PKMAP_NR() is more understandable and maintainable, so change it. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Reviewed-by: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 2da13a5..2576a71 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -99,7 +99,7 @@ struct page *kmap_to_page(void *vaddr) unsigned long addr = (unsigned long)vaddr; if (addr >= PKMAP_ADDR(0) && addr < PKMAP_ADDR(LAST_PKMAP)) { - int i = (addr - PKMAP_ADDR(0)) >> PAGE_SHIFT; + int i = PKMAP_NR(addr); return pte_page(pkmap_page_table[i]); } -- cgit v1.1 From cc33a303f1c155cf0147964586bb80fa732d8a21 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:01:20 -0800 Subject: mm, highmem: remove useless pool_lock The pool_lock protects the page_address_pool from concurrent access. But, access to the page_address_pool is already protected by kmap_lock. So remove it. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Reviewed-by: Minchan Kin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 2576a71..f0f0f1d 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -328,7 +328,6 @@ struct page_address_map { * page_address_map freelist, allocated from page_address_maps. */ static struct list_head page_address_pool; /* freelist */ -static spinlock_t pool_lock; /* protects page_address_pool */ /* * Hash table bucket @@ -395,11 +394,9 @@ void set_page_address(struct page *page, void *virtual) if (virtual) { /* Add */ BUG_ON(list_empty(&page_address_pool)); - spin_lock_irqsave(&pool_lock, flags); pam = list_entry(page_address_pool.next, struct page_address_map, list); list_del(&pam->list); - spin_unlock_irqrestore(&pool_lock, flags); pam->page = page; pam->virtual = virtual; @@ -413,9 +410,7 @@ void set_page_address(struct page *page, void *virtual) if (pam->page == page) { list_del(&pam->list); spin_unlock_irqrestore(&pas->lock, flags); - spin_lock_irqsave(&pool_lock, flags); list_add_tail(&pam->list, &page_address_pool); - spin_unlock_irqrestore(&pool_lock, flags); goto done; } } @@ -438,7 +433,6 @@ void __init page_address_init(void) INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); } - spin_lock_init(&pool_lock); } #endif /* defined(CONFIG_HIGHMEM) && !defined(WANT_PAGE_VIRTUAL) */ -- cgit v1.1 From a354e2c84eeebcd7d7bbdd71216895b8e9866b5c Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:01:23 -0800 Subject: mm, highmem: remove page_address_pool list We can find free page_address_map instance without the page_address_pool. So remove it. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Reviewed-by: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index f0f0f1d..4d6f96c 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -324,10 +324,7 @@ struct page_address_map { struct list_head list; }; -/* - * page_address_map freelist, allocated from page_address_maps. - */ -static struct list_head page_address_pool; /* freelist */ +static struct page_address_map page_address_maps[LAST_PKMAP]; /* * Hash table bucket @@ -392,12 +389,7 @@ void set_page_address(struct page *page, void *virtual) pas = page_slot(page); if (virtual) { /* Add */ - BUG_ON(list_empty(&page_address_pool)); - - pam = list_entry(page_address_pool.next, - struct page_address_map, list); - list_del(&pam->list); - + pam = &page_address_maps[PKMAP_NR((unsigned long)virtual)]; pam->page = page; pam->virtual = virtual; @@ -410,7 +402,6 @@ void set_page_address(struct page *page, void *virtual) if (pam->page == page) { list_del(&pam->list); spin_unlock_irqrestore(&pas->lock, flags); - list_add_tail(&pam->list, &page_address_pool); goto done; } } @@ -420,15 +411,10 @@ done: return; } -static struct page_address_map page_address_maps[LAST_PKMAP]; - void __init page_address_init(void) { int i; - INIT_LIST_HEAD(&page_address_pool); - for (i = 0; i < ARRAY_SIZE(page_address_maps); i++) - list_add(&page_address_maps[i].list, &page_address_pool); for (i = 0; i < ARRAY_SIZE(page_address_htable); i++) { INIT_LIST_HEAD(&page_address_htable[i].lh); spin_lock_init(&page_address_htable[i].lock); -- cgit v1.1 From eb2db439a3203ae86c35ad277ac4a3268a94baa1 Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:01:24 -0800 Subject: mm, highmem: get virtual address of the page using PKMAP_ADDR() In flush_all_zero_pkmaps(), we have an index of the pkmap associated with the page. Using this index, we can simply get virtual address of the page. So change it. Signed-off-by: Joonsoo Kim Cc: Mel Gorman Reviewed-by: Minchan Kim Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/highmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/highmem.c b/mm/highmem.c index 4d6f96c..d999077 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -137,8 +137,7 @@ static void flush_all_zero_pkmaps(void) * So no dangers, even with speculative execution. */ page = pte_page(pkmap_page_table[i]); - pte_clear(&init_mm, (unsigned long)page_address(page), - &pkmap_page_table[i]); + pte_clear(&init_mm, PKMAP_ADDR(i), &pkmap_page_table[i]); set_page_address(page, NULL); need_flush = 1; -- cgit v1.1 From a1dd450bcb1a05e8218b9aac0ee36f8755d8a140 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Tue, 11 Dec 2012 16:01:27 -0800 Subject: mm: thp: set the accessed flag for old pages on access fault On x86 memory accesses to pages without the ACCESSED flag set result in the ACCESSED flag being set automatically. With the ARM architecture a page access fault is raised instead (and it will continue to be raised until the ACCESSED flag is set for the appropriate PTE/PMD). For normal memory pages, handle_pte_fault will call pte_mkyoung (effectively setting the ACCESSED flag). For transparent huge pages, pmd_mkyoung will only be called for a write fault. This patch ensures that faults on transparent hugepages which do not result in a CoW update the access flags for the faulting pmd. Signed-off-by: Will Deacon Cc: Chris Metcalf Acked-by: Kirill A. Shutemov Cc: Andrea Arcangeli Acked-by: Johannes Weiner Cc: Ni zhan Chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 22 ++++++++++++++++++++++ mm/memory.c | 8 ++++++-- 2 files changed, 28 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ea5fb93..5f902e2 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -784,6 +784,28 @@ out: return ret; } +void huge_pmd_set_accessed(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, + pmd_t *pmd, pmd_t orig_pmd, + int dirty) +{ + pmd_t entry; + unsigned long haddr; + + spin_lock(&mm->page_table_lock); + if (unlikely(!pmd_same(*pmd, orig_pmd))) + goto unlock; + + entry = pmd_mkyoung(orig_pmd); + haddr = address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vma, haddr, pmd, entry, dirty)) + update_mmu_cache_pmd(vma, address, pmd); + +unlock: + spin_unlock(&mm->page_table_lock); +} + static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 221fc9f..7653773 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3537,8 +3537,9 @@ retry: barrier(); if (pmd_trans_huge(orig_pmd)) { - if (flags & FAULT_FLAG_WRITE && - !pmd_write(orig_pmd) && + unsigned int dirty = flags & FAULT_FLAG_WRITE; + + if (dirty && !pmd_write(orig_pmd) && !pmd_trans_splitting(orig_pmd)) { ret = do_huge_pmd_wp_page(mm, vma, address, pmd, orig_pmd); @@ -3550,6 +3551,9 @@ retry: if (unlikely(ret & VM_FAULT_OOM)) goto retry; return ret; + } else { + huge_pmd_set_accessed(mm, vma, address, pmd, + orig_pmd, dirty); } return 0; } -- cgit v1.1 From 9ff4868e3051d9128a24dd330bed32011a11421d Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:01:30 -0800 Subject: mm, oom: allow exiting threads to have access to memory reserves Exiting threads, those with PF_EXITING set, can pagefault and require memory before they can make forward progress. This happens, for instance, when a process must fault task->robust_list, a userspace structure, before detaching its memory. These threads also aren't guaranteed to get access to memory reserves unless oom killed or killed from userspace. The oom killer won't grant memory reserves if other threads are also exiting other than current and stalling at the same point. This prevents needlessly killing processes when others are already exiting. Instead of special casing all the possible situations between PF_EXITING getting set and a thread detaching its mm where it may allocate memory, which probably wouldn't get updated when a change is made to the exit path, the solution is to give all exiting threads access to memory reserves if they call the oom killer. This allows them to quickly allocate, detach its mm, and free the memory it represents. Summary of Luigi's bug report: : He had an oom condition where threads were faulting on task->robust_list : and repeatedly called the oom killer but it would defer killing a thread : because it saw other PF_EXITING threads. This can happen anytime we need : to allocate memory after setting PF_EXITING and before detaching our mm; : if there are other threads in the same state then the oom killer won't do : anything unless one of them happens to be killed from userspace. : : So instead of only deferring for PF_EXITING and !task->robust_list, it's : better to just give them access to memory reserves to prevent a potential : livelock so that any other faults that may be introduced in the future in : the exit path don't cause the same problem (and hopefully we don't allow : too many of those!). Signed-off-by: David Rientjes Acked-by: Minchan Kim Tested-by: Luigi Semenzato Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 31 +++++++++---------------------- 1 file changed, 9 insertions(+), 22 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 79e0f3e..7e9e911 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -310,26 +310,13 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (!task->mm) return OOM_SCAN_CONTINUE; - if (task->flags & PF_EXITING) { + if (task->flags & PF_EXITING && !force_kill) { /* - * If task is current and is in the process of releasing memory, - * allow the "kill" to set TIF_MEMDIE, which will allow it to - * access memory reserves. Otherwise, it may stall forever. - * - * The iteration isn't broken here, however, in case other - * threads are found to have already been oom killed. + * If this task is not being ptraced on exit, then wait for it + * to finish before killing some other task unnecessarily. */ - if (task == current) - return OOM_SCAN_SELECT; - else if (!force_kill) { - /* - * If this task is not being ptraced on exit, then wait - * for it to finish before killing some other task - * unnecessarily. - */ - if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) - return OOM_SCAN_ABORT; - } + if (!(task->group_leader->ptrace & PT_TRACE_EXIT)) + return OOM_SCAN_ABORT; } return OOM_SCAN_OK; } @@ -706,11 +693,11 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask, return; /* - * If current has a pending SIGKILL, then automatically select it. The - * goal is to allow it to allocate so that it may quickly exit and free - * its memory. + * If current has a pending SIGKILL or is exiting, then automatically + * select it. The goal is to allow it to allocate so that it may + * quickly exit and free its memory. */ - if (fatal_signal_pending(current)) { + if (fatal_signal_pending(current) || current->flags & PF_EXITING) { set_thread_flag(TIF_MEMDIE); return; } -- cgit v1.1 From 5de55b265a13bc263c823bbe05d87d2c5e785f6f Mon Sep 17 00:00:00 2001 From: Matthieu CASTET Date: Tue, 11 Dec 2012 16:01:31 -0800 Subject: dmapool: make DMAPOOL_DEBUG detect corruption of free marker This can help to catch the case where hardware is writing after dma free. [akpm@linux-foundation.org: tidy code, fix comment, use sizeof(page->offset), use pr_err()] Signed-off-by: Matthieu Castet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/dmapool.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'mm') diff --git a/mm/dmapool.c b/mm/dmapool.c index da1b0f0..c69781e 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -332,6 +332,30 @@ void *dma_pool_alloc(struct dma_pool *pool, gfp_t mem_flags, retval = offset + page->vaddr; *handle = offset + page->dma; #ifdef DMAPOOL_DEBUG + { + int i; + u8 *data = retval; + /* page->offset is stored in first 4 bytes */ + for (i = sizeof(page->offset); i < pool->size; i++) { + if (data[i] == POOL_POISON_FREED) + continue; + if (pool->dev) + dev_err(pool->dev, + "dma_pool_alloc %s, %p (corruped)\n", + pool->name, retval); + else + pr_err("dma_pool_alloc %s, %p (corruped)\n", + pool->name, retval); + + /* + * Dump the first 4 bytes even if they are not + * POOL_POISON_FREED + */ + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_OFFSET, 16, 1, + data, pool->size, 1); + break; + } + } memset(retval, POOL_POISON_ALLOCATED, pool->size); #endif spin_unlock_irqrestore(&pool->lock, flags); -- cgit v1.1 From ff604cf6d41f1e05f34762e1d764fe14a0f5f964 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Tue, 11 Dec 2012 16:01:32 -0800 Subject: mm: hwpoison: fix action_result() to print out dirty/clean action_result() fails to print out "dirty" even if an error occurred on a dirty pagecache, because when we check PageDirty in action_result() it was cleared after page isolation even if it's dirty before error handling. This can break some applications that monitor this message, so should be fixed. There are several callers of action_result() except page_action(), but either of them are not for LRU pages but for free pages or kernel pages, so we don't have to consider dirty or not for them. Note that PG_dirty can be set outside page locks as described in commit 6746aff74da2 ("HWPOISON: shmem: call set_page_dirty() with locked page"), so this patch does not completely closes the race window, but just narrows it. Signed-off-by: Naoya Horiguchi Reviewed-by: Andi Kleen Cc: Tony Luck Cc: "Jun'ichi Nomura" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 2c9fc73..108c52f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -781,16 +781,16 @@ static struct page_state { { compound, compound, "huge", me_huge_page }, #endif - { sc|dirty, sc|dirty, "swapcache", me_swapcache_dirty }, - { sc|dirty, sc, "swapcache", me_swapcache_clean }, + { sc|dirty, sc|dirty, "dirty swapcache", me_swapcache_dirty }, + { sc|dirty, sc, "clean swapcache", me_swapcache_clean }, - { unevict|dirty, unevict|dirty, "unevictable LRU", me_pagecache_dirty}, - { unevict, unevict, "unevictable LRU", me_pagecache_clean}, + { unevict|dirty, unevict|dirty, "dirty unevictable LRU", me_pagecache_dirty }, + { unevict, unevict, "clean unevictable LRU", me_pagecache_clean }, - { mlock|dirty, mlock|dirty, "mlocked LRU", me_pagecache_dirty }, - { mlock, mlock, "mlocked LRU", me_pagecache_clean }, + { mlock|dirty, mlock|dirty, "dirty mlocked LRU", me_pagecache_dirty }, + { mlock, mlock, "clean mlocked LRU", me_pagecache_clean }, - { lru|dirty, lru|dirty, "LRU", me_pagecache_dirty }, + { lru|dirty, lru|dirty, "dirty LRU", me_pagecache_dirty }, { lru|dirty, lru, "clean LRU", me_pagecache_clean }, /* @@ -812,14 +812,14 @@ static struct page_state { #undef slab #undef reserved +/* + * "Dirty/Clean" indication is not 100% accurate due to the possibility of + * setting PG_dirty outside page lock. See also comment above set_page_dirty(). + */ static void action_result(unsigned long pfn, char *msg, int result) { - struct page *page = pfn_to_page(pfn); - - printk(KERN_ERR "MCE %#lx: %s%s page recovery: %s\n", - pfn, - PageDirty(page) ? "dirty " : "", - msg, action_name[result]); + pr_err("MCE %#lx: %s page recovery: %s\n", + pfn, msg, action_name[result]); } static int page_action(struct page_state *ps, struct page *p, -- cgit v1.1 From 42d7395feb56f0655cd8b68e06fc6063823449f8 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Tue, 11 Dec 2012 16:01:34 -0800 Subject: mm: support more pagesizes for MAP_HUGETLB/SHM_HUGETLB There was some desire in large applications using MAP_HUGETLB or SHM_HUGETLB to use 1GB huge pages on some mappings, and stay with 2MB on others. This is useful together with NUMA policy: use 2MB interleaving on some mappings, but 1GB on local mappings. This patch extends the IPC/SHM syscall interfaces slightly to allow specifying the page size. It borrows some upper bits in the existing flag arguments and allows encoding the log of the desired page size in addition to the *_HUGETLB flag. When 0 is specified the default size is used, this makes the change fully compatible. Extending the internal hugetlb code to handle this is straight forward. Instead of a single mount it just keeps an array of them and selects the right mount based on the specified page size. When no page size is specified it uses the mount of the default page size. The change is not visible in /proc/mounts because internal mounts don't appear there. It also has very little overhead: the additional mounts just consume a super block, but not more memory when not used. I also exported the new flags to the user headers (they were previously under __KERNEL__). Right now only symbols for x86 and some other architecture for 1GB and 2MB are defined. The interface should already work for all other architectures though. Only architectures that define multiple hugetlb sizes actually need it (that is currently x86, tile, powerpc). However tile and powerpc have user configurable hugetlb sizes, so it's not easy to add defines. A program on those architectures would need to query sysfs and use the appropiate log2. [akpm@linux-foundation.org: cleanups] [rientjes@google.com: fix build] [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Andi Kleen Cc: Michael Kerrisk Acked-by: Rik van Riel Acked-by: KAMEZAWA Hiroyuki Cc: Hillf Danton Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index 9a796c4..ebf1903 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1153,8 +1153,9 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, * memory so no accounting is necessary */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, addr, len, - VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE); + VM_NORESERVE, + &user, HUGETLB_ANONHUGE_INODE, + (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); } -- cgit v1.1 From d37371870ceb1d2165397dc36114725b6dca946c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:01:38 -0800 Subject: mm: augment vma rbtree with rb_subtree_gap Define vma->rb_subtree_gap as the largest gap between any vma in the subtree rooted at that vma, and their predecessor. Or, for a recursive definition, vma->rb_subtree_gap is the max of: - vma->vm_start - vma->vm_prev->vm_end - rb_subtree_gap fields of the vmas pointed by vma->rb.rb_left and vma->rb.rb_right This will allow get_unmapped_area_* to find a free area of the right size in O(log(N)) time, instead of potentially having to do a linear walk across all the VMAs. Also define mm->highest_vm_end as the vm_end field of the highest vma, so that we can easily check if the following gap is suitable. This does have the potential to make unmapping VMAs more expensive, especially for processes with very large numbers of VMAs, where the VMA rbtree can grow quite deep. Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 143 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 132 insertions(+), 11 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index ebf1903..bdcea63 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include @@ -297,6 +298,27 @@ out: return retval; } +static long vma_compute_subtree_gap(struct vm_area_struct *vma) +{ + unsigned long max, subtree_gap; + max = vma->vm_start; + if (vma->vm_prev) + max -= vma->vm_prev->vm_end; + if (vma->vm_rb.rb_left) { + subtree_gap = rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb)->rb_subtree_gap; + if (subtree_gap > max) + max = subtree_gap; + } + if (vma->vm_rb.rb_right) { + subtree_gap = rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb)->rb_subtree_gap; + if (subtree_gap > max) + max = subtree_gap; + } + return max; +} + #ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { @@ -327,6 +349,18 @@ static int browse_rb(struct rb_root *root) return i; } +static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) +{ + struct rb_node *nd; + + for (nd = rb_first(root); nd; nd = rb_next(nd)) { + struct vm_area_struct *vma; + vma = rb_entry(nd, struct vm_area_struct, vm_rb); + BUG_ON(vma != ignore && + vma->rb_subtree_gap != vma_compute_subtree_gap(vma)); + } +} + void validate_mm(struct mm_struct *mm) { int bug = 0; @@ -349,9 +383,52 @@ void validate_mm(struct mm_struct *mm) BUG_ON(bug); } #else +#define validate_mm_rb(root, ignore) do { } while (0) #define validate_mm(mm) do { } while (0) #endif +RB_DECLARE_CALLBACKS(static, vma_gap_callbacks, struct vm_area_struct, vm_rb, + unsigned long, rb_subtree_gap, vma_compute_subtree_gap) + +/* + * Update augmented rbtree rb_subtree_gap values after vma->vm_start or + * vma->vm_prev->vm_end values changed, without modifying the vma's position + * in the rbtree. + */ +static void vma_gap_update(struct vm_area_struct *vma) +{ + /* + * As it turns out, RB_DECLARE_CALLBACKS() already created a callback + * function that does exacltly what we want. + */ + vma_gap_callbacks_propagate(&vma->vm_rb, NULL); +} + +static inline void vma_rb_insert(struct vm_area_struct *vma, + struct rb_root *root) +{ + /* All rb_subtree_gap values must be consistent prior to insertion */ + validate_mm_rb(root, NULL); + + rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + +static void vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) +{ + /* + * All rb_subtree_gap values must be consistent prior to erase, + * with the possible exception of the vma being erased. + */ + validate_mm_rb(root, vma); + + /* + * Note rb_erase_augmented is a fairly large inline function, + * so make sure we instantiate it only once with our desired + * augmented rbtree callbacks. + */ + rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); +} + /* * vma has some anon_vma assigned, and is already inserted on that * anon_vma's interval trees. @@ -421,8 +498,25 @@ static int find_vma_links(struct mm_struct *mm, unsigned long addr, void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, struct rb_node **rb_link, struct rb_node *rb_parent) { + /* Update tracking information for the gap following the new vma. */ + if (vma->vm_next) + vma_gap_update(vma->vm_next); + else + mm->highest_vm_end = vma->vm_end; + + /* + * vma->vm_prev wasn't known when we followed the rbtree to find the + * correct insertion point for that vma. As a result, we could not + * update the vma vm_rb parents rb_subtree_gap values on the way down. + * So, we first insert the vma with a zero rb_subtree_gap value + * (to be consistent with what we did on the way down), and then + * immediately update the gap to the correct value. Finally we + * rebalance the rbtree after all augmented values have been set. + */ rb_link_node(&vma->vm_rb, rb_parent, rb_link); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); + vma->rb_subtree_gap = 0; + vma_gap_update(vma); + vma_rb_insert(vma, &mm->mm_rb); } static void __vma_link_file(struct vm_area_struct *vma) @@ -498,12 +592,12 @@ static inline void __vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *prev) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next; - prev->vm_next = next; + vma_rb_erase(vma, &mm->mm_rb); + prev->vm_next = next = vma->vm_next; if (next) next->vm_prev = prev; - rb_erase(&vma->vm_rb, &mm->mm_rb); if (mm->mmap_cache == vma) mm->mmap_cache = prev; } @@ -525,6 +619,7 @@ int vma_adjust(struct vm_area_struct *vma, unsigned long start, struct rb_root *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; + bool start_changed = false, end_changed = false; long adjust_next = 0; int remove_next = 0; @@ -615,8 +710,14 @@ again: remove_next = 1 + (end > next->vm_end); vma_interval_tree_remove(next, root); } - vma->vm_start = start; - vma->vm_end = end; + if (start != vma->vm_start) { + vma->vm_start = start; + start_changed = true; + } + if (end != vma->vm_end) { + vma->vm_end = end; + end_changed = true; + } vma->vm_pgoff = pgoff; if (adjust_next) { next->vm_start += adjust_next << PAGE_SHIFT; @@ -645,6 +746,15 @@ again: remove_next = 1 + (end > next->vm_end); * (it may either follow vma or precede it). */ __insert_vm_struct(mm, insert); + } else { + if (start_changed) + vma_gap_update(vma); + if (end_changed) { + if (!next) + mm->highest_vm_end = end; + else if (!adjust_next) + vma_gap_update(next); + } } if (anon_vma) { @@ -678,10 +788,13 @@ again: remove_next = 1 + (end > next->vm_end); * we must remove another next too. It would clutter * up the code too much to do both in one go. */ - if (remove_next == 2) { - next = vma->vm_next; + next = vma->vm_next; + if (remove_next == 2) goto again; - } + else if (next) + vma_gap_update(next); + else + mm->highest_vm_end = end; } if (insert && file) uprobe_mmap(insert); @@ -1784,6 +1897,10 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) anon_vma_interval_tree_pre_update_vma(vma); vma->vm_end = address; anon_vma_interval_tree_post_update_vma(vma); + if (vma->vm_next) + vma_gap_update(vma->vm_next); + else + vma->vm_mm->highest_vm_end = address; perf_event_mmap(vma); } } @@ -1838,6 +1955,7 @@ int expand_downwards(struct vm_area_struct *vma, vma->vm_start = address; vma->vm_pgoff -= grow; anon_vma_interval_tree_post_update_vma(vma); + vma_gap_update(vma); perf_event_mmap(vma); } } @@ -1960,14 +2078,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; do { - rb_erase(&vma->vm_rb, &mm->mm_rb); + vma_rb_erase(vma, &mm->mm_rb); mm->map_count--; tail_vma = vma; vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; - if (vma) + if (vma) { vma->vm_prev = prev; + vma_gap_update(vma); + } else + mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; if (mm->unmap_area == arch_unmap_area) addr = prev ? prev->vm_end : mm->mmap_base; -- cgit v1.1 From 5a0768f641a5bad844860e67250baf0d1aa5e03c Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:01:42 -0800 Subject: mm: check rb_subtree_gap correctness When CONFIG_DEBUG_VM_RB is enabled, check that rb_subtree_gap is correctly set for every vma and that mm->highest_vm_end is also correct. Also add an explicit 'bug' variable to track if browse_rb() detected any invalid condition. [akpm@linux-foundation.org: repair innovative coding-style inventions] Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 53 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 39 insertions(+), 14 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bdcea63..ff93f6c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -322,31 +322,45 @@ static long vma_compute_subtree_gap(struct vm_area_struct *vma) #ifdef CONFIG_DEBUG_VM_RB static int browse_rb(struct rb_root *root) { - int i = 0, j; + int i = 0, j, bug = 0; struct rb_node *nd, *pn = NULL; unsigned long prev = 0, pend = 0; for (nd = rb_first(root); nd; nd = rb_next(nd)) { struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) - printk("vm_start %lx prev %lx\n", vma->vm_start, prev), i = -1; - if (vma->vm_start < pend) + if (vma->vm_start < prev) { + printk("vm_start %lx prev %lx\n", vma->vm_start, prev); + bug = 1; + } + if (vma->vm_start < pend) { printk("vm_start %lx pend %lx\n", vma->vm_start, pend); - if (vma->vm_start > vma->vm_end) - printk("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); + bug = 1; + } + if (vma->vm_start > vma->vm_end) { + printk("vm_end %lx < vm_start %lx\n", + vma->vm_end, vma->vm_start); + bug = 1; + } + if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { + printk("free gap %lx, correct %lx\n", + vma->rb_subtree_gap, + vma_compute_subtree_gap(vma)); + bug = 1; + } i++; pn = nd; prev = vma->vm_start; pend = vma->vm_end; } j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) { + for (nd = pn; nd; nd = rb_prev(nd)) j++; + if (i != j) { + printk("backwards %d, forwards %d\n", j, i); + bug = 1; } - if (i != j) - printk("backwards %d, forwards %d\n", j, i), i = 0; - return i; + return bug ? -1 : i; } static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) @@ -365,6 +379,7 @@ void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; + unsigned long highest_address = 0; struct vm_area_struct *vma = mm->mmap; while (vma) { struct anon_vma_chain *avc; @@ -372,14 +387,24 @@ void validate_mm(struct mm_struct *mm) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) anon_vma_interval_tree_verify(avc); vma_unlock_anon_vma(vma); + highest_address = vma->vm_end; vma = vma->vm_next; i++; } - if (i != mm->map_count) - printk("map_count %d vm_next %d\n", mm->map_count, i), bug = 1; + if (i != mm->map_count) { + printk("map_count %d vm_next %d\n", mm->map_count, i); + bug = 1; + } + if (highest_address != mm->highest_vm_end) { + printk("mm->highest_vm_end %lx, found %lx\n", + mm->highest_vm_end, highest_address); + bug = 1; + } i = browse_rb(&mm->mm_rb); - if (i != mm->map_count) - printk("map_count %d rb %d\n", mm->map_count, i), bug = 1; + if (i != mm->map_count) { + printk("map_count %d rb %d\n", mm->map_count, i); + bug = 1; + } BUG_ON(bug); } #else -- cgit v1.1 From db4fbfb9523c93583c339e66023506f651c1d54b Mon Sep 17 00:00:00 2001 From: Michel Lespinasse Date: Tue, 11 Dec 2012 16:01:49 -0800 Subject: mm: vm_unmapped_area() lookup function Implement vm_unmapped_area() using the rb_subtree_gap and highest_vm_end information to look up for suitable virtual address space gaps. struct vm_unmapped_area_info is used to define the desired allocation request: - lowest or highest possible address matching the remaining constraints - desired gap length - low/high address limits that the gap must fit into - alignment mask and offset Also update the generic arch_get_unmapped_area[_topdown] functions to make use of vm_unmapped_area() instead of implementing a brute force search. [akpm@linux-foundation.org: checkpatch fixes] Signed-off-by: Michel Lespinasse Reviewed-by: Rik van Riel Cc: Hugh Dickins Cc: Russell King Cc: Ralf Baechle Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 312 ++++++++++++++++++++++++++++++++++++++++++++------------------ 1 file changed, 222 insertions(+), 90 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index ff93f6c..5646677 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1539,6 +1539,206 @@ unacct_error: return error; } +unsigned long unmapped_area(struct vm_unmapped_area_info *info) +{ + /* + * We implement the search by looking for an rbtree node that + * immediately follows a suitable gap. That is, + * - gap_start = vma->vm_prev->vm_end <= info->high_limit - length; + * - gap_end = vma->vm_start >= info->low_limit + length; + * - gap_end - gap_start >= length + */ + + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long length, low_limit, high_limit, gap_start, gap_end; + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + /* Adjust search limits by the desired length */ + if (info->high_limit < length) + return -ENOMEM; + high_limit = info->high_limit - length; + + if (info->low_limit > high_limit) + return -ENOMEM; + low_limit = info->low_limit + length; + + /* Check if rbtree root looks promising */ + if (RB_EMPTY_ROOT(&mm->mm_rb)) + goto check_highest; + vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); + if (vma->rb_subtree_gap < length) + goto check_highest; + + while (true) { + /* Visit left subtree if it looks promising */ + gap_end = vma->vm_start; + if (gap_end >= low_limit && vma->vm_rb.rb_left) { + struct vm_area_struct *left = + rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb); + if (left->rb_subtree_gap >= length) { + vma = left; + continue; + } + } + + gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; +check_current: + /* Check if current node has a suitable gap */ + if (gap_start > high_limit) + return -ENOMEM; + if (gap_end >= low_limit && gap_end - gap_start >= length) + goto found; + + /* Visit right subtree if it looks promising */ + if (vma->vm_rb.rb_right) { + struct vm_area_struct *right = + rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb); + if (right->rb_subtree_gap >= length) { + vma = right; + continue; + } + } + + /* Go back up the rbtree to find next candidate node */ + while (true) { + struct rb_node *prev = &vma->vm_rb; + if (!rb_parent(prev)) + goto check_highest; + vma = rb_entry(rb_parent(prev), + struct vm_area_struct, vm_rb); + if (prev == vma->vm_rb.rb_left) { + gap_start = vma->vm_prev->vm_end; + gap_end = vma->vm_start; + goto check_current; + } + } + } + +check_highest: + /* Check highest gap, which does not precede any rbtree node */ + gap_start = mm->highest_vm_end; + gap_end = ULONG_MAX; /* Only for VM_BUG_ON below */ + if (gap_start > high_limit) + return -ENOMEM; + +found: + /* We found a suitable gap. Clip it with the original low_limit. */ + if (gap_start < info->low_limit) + gap_start = info->low_limit; + + /* Adjust gap address to the desired alignment */ + gap_start += (info->align_offset - gap_start) & info->align_mask; + + VM_BUG_ON(gap_start + info->length > info->high_limit); + VM_BUG_ON(gap_start + info->length > gap_end); + return gap_start; +} + +unsigned long unmapped_area_topdown(struct vm_unmapped_area_info *info) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + unsigned long length, low_limit, high_limit, gap_start, gap_end; + + /* Adjust search length to account for worst case alignment overhead */ + length = info->length + info->align_mask; + if (length < info->length) + return -ENOMEM; + + /* + * Adjust search limits by the desired length. + * See implementation comment at top of unmapped_area(). + */ + gap_end = info->high_limit; + if (gap_end < length) + return -ENOMEM; + high_limit = gap_end - length; + + if (info->low_limit > high_limit) + return -ENOMEM; + low_limit = info->low_limit + length; + + /* Check highest gap, which does not precede any rbtree node */ + gap_start = mm->highest_vm_end; + if (gap_start <= high_limit) + goto found_highest; + + /* Check if rbtree root looks promising */ + if (RB_EMPTY_ROOT(&mm->mm_rb)) + return -ENOMEM; + vma = rb_entry(mm->mm_rb.rb_node, struct vm_area_struct, vm_rb); + if (vma->rb_subtree_gap < length) + return -ENOMEM; + + while (true) { + /* Visit right subtree if it looks promising */ + gap_start = vma->vm_prev ? vma->vm_prev->vm_end : 0; + if (gap_start <= high_limit && vma->vm_rb.rb_right) { + struct vm_area_struct *right = + rb_entry(vma->vm_rb.rb_right, + struct vm_area_struct, vm_rb); + if (right->rb_subtree_gap >= length) { + vma = right; + continue; + } + } + +check_current: + /* Check if current node has a suitable gap */ + gap_end = vma->vm_start; + if (gap_end < low_limit) + return -ENOMEM; + if (gap_start <= high_limit && gap_end - gap_start >= length) + goto found; + + /* Visit left subtree if it looks promising */ + if (vma->vm_rb.rb_left) { + struct vm_area_struct *left = + rb_entry(vma->vm_rb.rb_left, + struct vm_area_struct, vm_rb); + if (left->rb_subtree_gap >= length) { + vma = left; + continue; + } + } + + /* Go back up the rbtree to find next candidate node */ + while (true) { + struct rb_node *prev = &vma->vm_rb; + if (!rb_parent(prev)) + return -ENOMEM; + vma = rb_entry(rb_parent(prev), + struct vm_area_struct, vm_rb); + if (prev == vma->vm_rb.rb_right) { + gap_start = vma->vm_prev ? + vma->vm_prev->vm_end : 0; + goto check_current; + } + } + } + +found: + /* We found a suitable gap. Clip it with the original high_limit. */ + if (gap_end > info->high_limit) + gap_end = info->high_limit; + +found_highest: + /* Compute highest gap address at the desired alignment */ + gap_end -= info->length; + gap_end -= (gap_end - info->align_offset) & info->align_mask; + + VM_BUG_ON(gap_end < info->low_limit); + VM_BUG_ON(gap_end < gap_start); + return gap_end; +} + /* Get an address range which is currently unmapped. * For shmat() with addr=0. * @@ -1557,7 +1757,7 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; - unsigned long start_addr; + struct vm_unmapped_area_info info; if (len > TASK_SIZE) return -ENOMEM; @@ -1572,40 +1772,13 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, (!vma || addr + len <= vma->vm_start)) return addr; } - if (len > mm->cached_hole_size) { - start_addr = addr = mm->free_area_cache; - } else { - start_addr = addr = TASK_UNMAPPED_BASE; - mm->cached_hole_size = 0; - } -full_search: - for (vma = find_vma(mm, addr); ; vma = vma->vm_next) { - /* At this point: (!vma || addr < vma->vm_end). */ - if (TASK_SIZE - len < addr) { - /* - * Start a new search - just in case we missed - * some holes. - */ - if (start_addr != TASK_UNMAPPED_BASE) { - addr = TASK_UNMAPPED_BASE; - start_addr = addr; - mm->cached_hole_size = 0; - goto full_search; - } - return -ENOMEM; - } - if (!vma || addr + len <= vma->vm_start) { - /* - * Remember the place where we stopped the search: - */ - mm->free_area_cache = addr + len; - return addr; - } - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - addr = vma->vm_end; - } + info.flags = 0; + info.length = len; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + info.align_mask = 0; + return vm_unmapped_area(&info); } #endif @@ -1630,7 +1803,8 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, { struct vm_area_struct *vma; struct mm_struct *mm = current->mm; - unsigned long addr = addr0, start_addr; + unsigned long addr = addr0; + struct vm_unmapped_area_info info; /* requested length too big for entire address space */ if (len > TASK_SIZE) @@ -1648,53 +1822,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, return addr; } - /* check if free_area_cache is useful for us */ - if (len <= mm->cached_hole_size) { - mm->cached_hole_size = 0; - mm->free_area_cache = mm->mmap_base; - } - -try_again: - /* either no address requested or can't fit in requested address hole */ - start_addr = addr = mm->free_area_cache; - - if (addr < len) - goto fail; - - addr -= len; - do { - /* - * Lookup failure means no vma is above this address, - * else if new region fits below vma->vm_start, - * return with success: - */ - vma = find_vma(mm, addr); - if (!vma || addr+len <= vma->vm_start) - /* remember the address as a hint for next time */ - return (mm->free_area_cache = addr); - - /* remember the largest hole we saw so far */ - if (addr + mm->cached_hole_size < vma->vm_start) - mm->cached_hole_size = vma->vm_start - addr; - - /* try just below the current vma->vm_start */ - addr = vma->vm_start-len; - } while (len < vma->vm_start); - -fail: - /* - * if hint left us with no space for the requested - * mapping then try again: - * - * Note: this is different with the case of bottomup - * which does the fully line-search, but we use find_vma - * here that causes some holes skipped. - */ - if (start_addr != mm->mmap_base) { - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = 0; - goto try_again; - } + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; + info.low_limit = PAGE_SIZE; + info.high_limit = mm->mmap_base; + info.align_mask = 0; + addr = vm_unmapped_area(&info); /* * A failed mmap() very likely causes application failure, @@ -1702,14 +1835,13 @@ fail: * can happen with large stack limits and large mmap() * allocations. */ - mm->cached_hole_size = ~0UL; - mm->free_area_cache = TASK_UNMAPPED_BASE; - addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); - /* - * Restore the topdown base: - */ - mm->free_area_cache = mm->mmap_base; - mm->cached_hole_size = ~0UL; + if (addr & ~PAGE_MASK) { + VM_BUG_ON(addr != -ENOMEM); + info.flags = 0; + info.low_limit = TASK_UNMAPPED_BASE; + info.high_limit = TASK_SIZE; + addr = vm_unmapped_area(&info); + } return addr; } -- cgit v1.1 From 78bd52097d04205a33a8014a1b8ac01cf1ae9d06 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:31 -0800 Subject: mm: adjust address_space_operations.migratepage() return code Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch-set follows the main idea discussed at 2012 LSFMMS session: "Ballooning for transparent huge pages" -- http://lwn.net/Articles/490114/ to introduce the required changes to the virtio_balloon driver, as well as the changes to the core compaction & migration bits, in order to make those subsystems aware of ballooned pages and allow memory balloon pages become movable within a guest, thus avoiding the aforementioned fragmentation issue Following are numbers that prove this patch benefits on allowing compaction to be more effective at memory ballooned guests. Results for STRESS-HIGHALLOC benchmark, from Mel Gorman's mmtests suite, running on a 4gB RAM KVM guest which was ballooning 512mB RAM in 64mB chunks, at every minute (inflating/deflating), while test was running: ===BEGIN stress-highalloc STRESS-HIGHALLOC highalloc-3.7 highalloc-3.7 rc4-clean rc4-patch Pass 1 55.00 ( 0.00%) 62.00 ( 7.00%) Pass 2 54.00 ( 0.00%) 62.00 ( 8.00%) while Rested 75.00 ( 0.00%) 80.00 ( 5.00%) MMTests Statistics: duration 3.7 3.7 rc4-clean rc4-patch User 1207.59 1207.46 System 1300.55 1299.61 Elapsed 2273.72 2157.06 MMTests Statistics: vmstat 3.7 3.7 rc4-clean rc4-patch Page Ins 3581516 2374368 Page Outs 11148692 10410332 Swap Ins 80 47 Swap Outs 3641 476 Direct pages scanned 37978 33826 Kswapd pages scanned 1828245 1342869 Kswapd pages reclaimed 1710236 1304099 Direct pages reclaimed 32207 31005 Kswapd efficiency 93% 97% Kswapd velocity 804.077 622.546 Direct efficiency 84% 91% Direct velocity 16.703 15.682 Percentage direct scans 2% 2% Page writes by reclaim 79252 9704 Page writes file 75611 9228 Page writes anon 3641 476 Page reclaim immediate 16764 11014 Page rescued immediate 0 0 Slabs scanned 2171904 2152448 Direct inode steals 385 2261 Kswapd inode steals 659137 609670 Kswapd skipped wait 1 69 THP fault alloc 546 631 THP collapse alloc 361 339 THP splits 259 263 THP fault fallback 98 50 THP collapse fail 20 17 Compaction stalls 747 499 Compaction success 244 145 Compaction failures 503 354 Compaction pages moved 370888 474837 Compaction move failure 77378 65259 ===END stress-highalloc This patch: Introduce MIGRATEPAGE_SUCCESS as the default return code for address_space_operations.migratepage() method and documents the expected return code for the same method in failure cases. Signed-off-by: Rafael Aquini Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/migrate.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/migrate.c b/mm/migrate.c index 1dc4598..33f5f82 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -276,7 +276,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, /* Anonymous page without mapping */ if (page_count(page) != 1) return -EAGAIN; - return 0; + return MIGRATEPAGE_SUCCESS; } spin_lock_irq(&mapping->tree_lock); @@ -346,7 +346,7 @@ static int migrate_page_move_mapping(struct address_space *mapping, } spin_unlock_irq(&mapping->tree_lock); - return 0; + return MIGRATEPAGE_SUCCESS; } /* @@ -362,7 +362,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, if (!mapping) { if (page_count(page) != 1) return -EAGAIN; - return 0; + return MIGRATEPAGE_SUCCESS; } spin_lock_irq(&mapping->tree_lock); @@ -389,7 +389,7 @@ int migrate_huge_page_move_mapping(struct address_space *mapping, page_unfreeze_refs(page, expected_count - 1); spin_unlock_irq(&mapping->tree_lock); - return 0; + return MIGRATEPAGE_SUCCESS; } /* @@ -476,11 +476,11 @@ int migrate_page(struct address_space *mapping, rc = migrate_page_move_mapping(mapping, newpage, page, NULL, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; migrate_page_copy(newpage, page); - return 0; + return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(migrate_page); @@ -503,7 +503,7 @@ int buffer_migrate_page(struct address_space *mapping, rc = migrate_page_move_mapping(mapping, newpage, page, head, mode); - if (rc) + if (rc != MIGRATEPAGE_SUCCESS) return rc; /* @@ -539,7 +539,7 @@ int buffer_migrate_page(struct address_space *mapping, } while (bh != head); - return 0; + return MIGRATEPAGE_SUCCESS; } EXPORT_SYMBOL(buffer_migrate_page); #endif @@ -618,7 +618,7 @@ static int fallback_migrate_page(struct address_space *mapping, * * Return value: * < 0 - error code - * == 0 - success + * MIGRATEPAGE_SUCCESS - success */ static int move_to_new_page(struct page *newpage, struct page *page, int remap_swapcache, enum migrate_mode mode) @@ -655,7 +655,7 @@ static int move_to_new_page(struct page *newpage, struct page *page, else rc = fallback_migrate_page(mapping, newpage, page, mode); - if (rc) { + if (rc != MIGRATEPAGE_SUCCESS) { newpage->mapping = NULL; } else { if (remap_swapcache) @@ -804,7 +804,7 @@ skip_unmap: put_anon_vma(anon_vma); uncharge: - mem_cgroup_end_migration(mem, page, newpage, rc == 0); + mem_cgroup_end_migration(mem, page, newpage, rc == MIGRATEPAGE_SUCCESS); unlock: unlock_page(page); out: @@ -977,7 +977,7 @@ int migrate_pages(struct list_head *from, case -EAGAIN: retry++; break; - case 0: + case MIGRATEPAGE_SUCCESS: break; default: /* Permanent failure */ @@ -986,15 +986,12 @@ int migrate_pages(struct list_head *from, } } } - rc = 0; + rc = nr_failed + retry; out: if (!swapwrite) current->flags &= ~PF_SWAPWRITE; - if (rc) - return rc; - - return nr_failed + retry; + return rc; } int migrate_huge_page(struct page *hpage, new_page_t get_new_page, @@ -1014,7 +1011,7 @@ int migrate_huge_page(struct page *hpage, new_page_t get_new_page, /* try again */ cond_resched(); break; - case 0: + case MIGRATEPAGE_SUCCESS: goto out; default: rc = -EIO; -- cgit v1.1 From 18468d93e53b037e1a04ec58398eab763d054064 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:38 -0800 Subject: mm: introduce a common interface for balloon pages mobility Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch introduces a common interface to help a balloon driver on making its page set movable to compaction, and thus allowing the system to better leverage the compation efforts on memory defragmentation. [akpm@linux-foundation.org: use PAGE_FLAGS_CHECK_AT_PREP, s/__balloon_page_flags/page_flags_cleared/, small cleanups] [rientjes@google.com: allow balloon compaction for any system with memory compaction enabled, which is the defconfig] Signed-off-by: Rafael Aquini Acked-by: Mel Gorman Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Rik van Riel Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 15 +++ mm/Makefile | 3 +- mm/balloon_compaction.c | 302 ++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 319 insertions(+), 1 deletion(-) create mode 100644 mm/balloon_compaction.c (limited to 'mm') diff --git a/mm/Kconfig b/mm/Kconfig index a3f8ddd..e6651c5 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -188,6 +188,21 @@ config SPLIT_PTLOCK_CPUS default "4" # +# support for memory balloon compaction +config BALLOON_COMPACTION + bool "Allow for balloon memory compaction/migration" + def_bool y + depends on COMPACTION && VIRTIO_BALLOON + help + Memory fragmentation introduced by ballooning might reduce + significantly the number of 2MB contiguous memory blocks that can be + used within a guest, thus imposing performance penalties associated + with the reduced number of transparent huge pages that could be used + by the guest workload. Allowing the compaction & migration for memory + pages enlisted as being part of memory balloon devices avoids the + scenario aforementioned and helps improving memory defragmentation. + +# # support for memory compaction config COMPACTION bool "Allow for memory compaction" diff --git a/mm/Makefile b/mm/Makefile index 6b025f8..3a46287 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -16,7 +16,8 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o mmu_context.o percpu.o slab_common.o \ - compaction.o interval_tree.o $(mmu-y) + compaction.o balloon_compaction.o \ + interval_tree.o $(mmu-y) obj-y += init-mm.o diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c new file mode 100644 index 0000000..07dbc8e --- /dev/null +++ b/mm/balloon_compaction.c @@ -0,0 +1,302 @@ +/* + * mm/balloon_compaction.c + * + * Common interface for making balloon pages movable by compaction. + * + * Copyright (C) 2012, Red Hat, Inc. Rafael Aquini + */ +#include +#include +#include +#include + +/* + * balloon_devinfo_alloc - allocates a balloon device information descriptor. + * @balloon_dev_descriptor: pointer to reference the balloon device which + * this struct balloon_dev_info will be servicing. + * + * Driver must call it to properly allocate and initialize an instance of + * struct balloon_dev_info which will be used to reference a balloon device + * as well as to keep track of the balloon device page list. + */ +struct balloon_dev_info *balloon_devinfo_alloc(void *balloon_dev_descriptor) +{ + struct balloon_dev_info *b_dev_info; + b_dev_info = kmalloc(sizeof(*b_dev_info), GFP_KERNEL); + if (!b_dev_info) + return ERR_PTR(-ENOMEM); + + b_dev_info->balloon_device = balloon_dev_descriptor; + b_dev_info->mapping = NULL; + b_dev_info->isolated_pages = 0; + spin_lock_init(&b_dev_info->pages_lock); + INIT_LIST_HEAD(&b_dev_info->pages); + + return b_dev_info; +} +EXPORT_SYMBOL_GPL(balloon_devinfo_alloc); + +/* + * balloon_page_enqueue - allocates a new page and inserts it into the balloon + * page list. + * @b_dev_info: balloon device decriptor where we will insert a new page to + * + * Driver must call it to properly allocate a new enlisted balloon page + * before definetively removing it from the guest system. + * This function returns the page address for the recently enqueued page or + * NULL in the case we fail to allocate a new page this turn. + */ +struct page *balloon_page_enqueue(struct balloon_dev_info *b_dev_info) +{ + unsigned long flags; + struct page *page = alloc_page(balloon_mapping_gfp_mask() | + __GFP_NOMEMALLOC | __GFP_NORETRY); + if (!page) + return NULL; + + /* + * Block others from accessing the 'page' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'page' at this point. + */ + BUG_ON(!trylock_page(page)); + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + balloon_page_insert(page, b_dev_info->mapping, &b_dev_info->pages); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_enqueue); + +/* + * balloon_page_dequeue - removes a page from balloon's page list and returns + * the its address to allow the driver release the page. + * @b_dev_info: balloon device decriptor where we will grab a page from. + * + * Driver must call it to properly de-allocate a previous enlisted balloon page + * before definetively releasing it back to the guest system. + * This function returns the page address for the recently dequeued page or + * NULL in the case we find balloon's page list temporarily empty due to + * compaction isolated pages. + */ +struct page *balloon_page_dequeue(struct balloon_dev_info *b_dev_info) +{ + struct page *page, *tmp; + unsigned long flags; + bool dequeued_page; + + dequeued_page = false; + list_for_each_entry_safe(page, tmp, &b_dev_info->pages, lru) { + /* + * Block others from accessing the 'page' while we get around + * establishing additional references and preparing the 'page' + * to be released by the balloon driver. + */ + if (trylock_page(page)) { + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + /* + * Raise the page refcount here to prevent any wrong + * attempt to isolate this page, in case of coliding + * with balloon_page_isolate() just after we release + * the page lock. + * + * balloon_page_free() will take care of dropping + * this extra refcount later. + */ + get_page(page); + balloon_page_delete(page); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + unlock_page(page); + dequeued_page = true; + break; + } + } + + if (!dequeued_page) { + /* + * If we are unable to dequeue a balloon page because the page + * list is empty and there is no isolated pages, then something + * went out of track and some balloon pages are lost. + * BUG() here, otherwise the balloon driver may get stuck into + * an infinite loop while attempting to release all its pages. + */ + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + if (unlikely(list_empty(&b_dev_info->pages) && + !b_dev_info->isolated_pages)) + BUG(); + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); + page = NULL; + } + return page; +} +EXPORT_SYMBOL_GPL(balloon_page_dequeue); + +#ifdef CONFIG_BALLOON_COMPACTION +/* + * balloon_mapping_alloc - allocates a special ->mapping for ballooned pages. + * @b_dev_info: holds the balloon device information descriptor. + * @a_ops: balloon_mapping address_space_operations descriptor. + * + * Driver must call it to properly allocate and initialize an instance of + * struct address_space which will be used as the special page->mapping for + * balloon device enlisted page instances. + */ +struct address_space *balloon_mapping_alloc(struct balloon_dev_info *b_dev_info, + const struct address_space_operations *a_ops) +{ + struct address_space *mapping; + + mapping = kmalloc(sizeof(*mapping), GFP_KERNEL); + if (!mapping) + return ERR_PTR(-ENOMEM); + + /* + * Give a clean 'zeroed' status to all elements of this special + * balloon page->mapping struct address_space instance. + */ + address_space_init_once(mapping); + + /* + * Set mapping->flags appropriately, to allow balloon pages + * ->mapping identification. + */ + mapping_set_balloon(mapping); + mapping_set_gfp_mask(mapping, balloon_mapping_gfp_mask()); + + /* balloon's page->mapping->a_ops callback descriptor */ + mapping->a_ops = a_ops; + + /* + * Establish a pointer reference back to the balloon device descriptor + * this particular page->mapping will be servicing. + * This is used by compaction / migration procedures to identify and + * access the balloon device pageset while isolating / migrating pages. + * + * As some balloon drivers can register multiple balloon devices + * for a single guest, this also helps compaction / migration to + * properly deal with multiple balloon pagesets, when required. + */ + mapping->private_data = b_dev_info; + b_dev_info->mapping = mapping; + + return mapping; +} +EXPORT_SYMBOL_GPL(balloon_mapping_alloc); + +static inline void __isolate_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = page->mapping->private_data; + unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_del(&page->lru); + b_dev_info->isolated_pages++; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline void __putback_balloon_page(struct page *page) +{ + struct balloon_dev_info *b_dev_info = page->mapping->private_data; + unsigned long flags; + spin_lock_irqsave(&b_dev_info->pages_lock, flags); + list_add(&page->lru, &b_dev_info->pages); + b_dev_info->isolated_pages--; + spin_unlock_irqrestore(&b_dev_info->pages_lock, flags); +} + +static inline int __migrate_balloon_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + return page->mapping->a_ops->migratepage(mapping, newpage, page, mode); +} + +/* __isolate_lru_page() counterpart for a ballooned page */ +bool balloon_page_isolate(struct page *page) +{ + /* + * Avoid burning cycles with pages that are yet under __free_pages(), + * or just got freed under us. + * + * In case we 'win' a race for a balloon page being freed under us and + * raise its refcount preventing __free_pages() from doing its job + * the put_page() at the end of this block will take care of + * release this page, thus avoiding a nasty leakage. + */ + if (likely(get_page_unless_zero(page))) { + /* + * As balloon pages are not isolated from LRU lists, concurrent + * compaction threads can race against page migration functions + * as well as race against the balloon driver releasing a page. + * + * In order to avoid having an already isolated balloon page + * being (wrongly) re-isolated while it is under migration, + * or to avoid attempting to isolate pages being released by + * the balloon driver, lets be sure we have the page lock + * before proceeding with the balloon page isolation steps. + */ + if (likely(trylock_page(page))) { + /* + * A ballooned page, by default, has just one refcount. + * Prevent concurrent compaction threads from isolating + * an already isolated balloon page by refcount check. + */ + if (__is_movable_balloon_page(page) && + page_count(page) == 2) { + __isolate_balloon_page(page); + unlock_page(page); + return true; + } + unlock_page(page); + } + put_page(page); + } + return false; +} + +/* putback_lru_page() counterpart for a ballooned page */ +void balloon_page_putback(struct page *page) +{ + /* + * 'lock_page()' stabilizes the page and prevents races against + * concurrent isolation threads attempting to re-isolate it. + */ + lock_page(page); + + if (__is_movable_balloon_page(page)) { + __putback_balloon_page(page); + /* drop the extra ref count taken for page isolation */ + put_page(page); + } else { + WARN_ON(1); + dump_page(page); + } + unlock_page(page); +} + +/* move_to_new_page() counterpart for a ballooned page */ +int balloon_page_migrate(struct page *newpage, + struct page *page, enum migrate_mode mode) +{ + struct address_space *mapping; + int rc = -EAGAIN; + + /* + * Block others from accessing the 'newpage' when we get around to + * establishing additional references. We should be the only one + * holding a reference to the 'newpage' at this point. + */ + BUG_ON(!trylock_page(newpage)); + + if (WARN_ON(!__is_movable_balloon_page(page))) { + dump_page(page); + unlock_page(newpage); + return rc; + } + + mapping = page->mapping; + if (mapping) + rc = __migrate_balloon_page(mapping, newpage, page, mode); + + unlock_page(newpage); + return rc; +} +#endif /* CONFIG_BALLOON_COMPACTION */ -- cgit v1.1 From bf6bddf1924eaebf2beb85e4249a89dd16d4eed6 Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:42 -0800 Subject: mm: introduce compaction and migration for ballooned pages Memory fragmentation introduced by ballooning might reduce significantly the number of 2MB contiguous memory blocks that can be used within a guest, thus imposing performance penalties associated with the reduced number of transparent huge pages that could be used by the guest workload. This patch introduces the helper functions as well as the necessary changes to teach compaction and migration bits how to cope with pages which are part of a guest memory balloon, in order to make them movable by memory compaction procedures. Signed-off-by: Rafael Aquini Acked-by: Mel Gorman Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Rik van Riel Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 21 +++++++++++++++++++-- mm/migrate.c | 34 ++++++++++++++++++++++++++++++++-- 2 files changed, 51 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 694eaab..470474c 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "internal.h" #if defined CONFIG_COMPACTION || defined CONFIG_CMA @@ -565,9 +566,24 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, goto next_pageblock; } - /* Check may be lockless but that's ok as we recheck later */ - if (!PageLRU(page)) + /* + * Check may be lockless but that's ok as we recheck later. + * It's possible to migrate LRU pages and balloon pages + * Skip any other type of page + */ + if (!PageLRU(page)) { + if (unlikely(balloon_page_movable(page))) { + if (locked && balloon_page_isolate(page)) { + /* Successfully isolated */ + cc->finished_update_migrate = true; + list_add(&page->lru, migratelist); + cc->nr_migratepages++; + nr_isolated++; + goto check_compact_cluster; + } + } continue; + } /* * PageLRU is set. lru_lock normally excludes isolation @@ -621,6 +637,7 @@ isolate_migratepages_range(struct zone *zone, struct compact_control *cc, cc->nr_migratepages++; nr_isolated++; +check_compact_cluster: /* Avoid isolating too much */ if (cc->nr_migratepages == COMPACT_CLUSTER_MAX) { ++low_pfn; diff --git a/mm/migrate.c b/mm/migrate.c index 33f5f82..427343c 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -79,7 +80,10 @@ void putback_lru_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); - putback_lru_page(page); + if (unlikely(balloon_page_movable(page))) + balloon_page_putback(page); + else + putback_lru_page(page); } } @@ -768,6 +772,18 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } } + if (unlikely(balloon_page_movable(page))) { + /* + * A ballooned page does not need any special attention from + * physical to virtual reverse mapping procedures. + * Skip any attempt to unmap PTEs or to remap swap cache, + * in order to avoid burning cycles at rmap level, and perform + * the page migration right away (proteced by page lock). + */ + rc = balloon_page_migrate(newpage, page, mode); + goto uncharge; + } + /* * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU @@ -804,7 +820,9 @@ skip_unmap: put_anon_vma(anon_vma); uncharge: - mem_cgroup_end_migration(mem, page, newpage, rc == MIGRATEPAGE_SUCCESS); + mem_cgroup_end_migration(mem, page, newpage, + (rc == MIGRATEPAGE_SUCCESS || + rc == MIGRATEPAGE_BALLOON_SUCCESS)); unlock: unlock_page(page); out: @@ -836,6 +854,18 @@ static int unmap_and_move(new_page_t get_new_page, unsigned long private, goto out; rc = __unmap_and_move(page, newpage, force, offlining, mode); + + if (unlikely(rc == MIGRATEPAGE_BALLOON_SUCCESS)) { + /* + * A ballooned page has been migrated already. + * Now, it's the time to wrap-up counters, + * handle the page back to Buddy and return. + */ + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); + balloon_page_free(page); + return MIGRATEPAGE_SUCCESS; + } out: if (rc != -EAGAIN) { /* -- cgit v1.1 From 5733c7d11dff44e98d2ca16617886a78086b354f Mon Sep 17 00:00:00 2001 From: Rafael Aquini Date: Tue, 11 Dec 2012 16:02:47 -0800 Subject: mm: introduce putback_movable_pages() The PATCH "mm: introduce compaction and migration for virtio ballooned pages" hacks around putback_lru_pages() in order to allow ballooned pages to be re-inserted on balloon page list as if a ballooned page was like a LRU page. As ballooned pages are not legitimate LRU pages, this patch introduces putback_movable_pages() to properly cope with cases where the isolated pageset contains ballooned pages and LRU pages, thus fixing the mentioned inelegant hack around putback_lru_pages(). Signed-off-by: Rafael Aquini Cc: Rusty Russell Cc: "Michael S. Tsirkin" Cc: Rik van Riel Cc: Mel Gorman Cc: Andi Kleen Cc: Konrad Rzeszutek Wilk Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 6 +++--- mm/migrate.c | 20 ++++++++++++++++++++ mm/page_alloc.c | 2 +- 3 files changed, 24 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/compaction.c b/mm/compaction.c index 470474c..d24dd2d 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1003,7 +1003,7 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) switch (isolate_migratepages(zone, cc)) { case ISOLATE_ABORT: ret = COMPACT_PARTIAL; - putback_lru_pages(&cc->migratepages); + putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; goto out; case ISOLATE_NONE: @@ -1026,9 +1026,9 @@ static int compact_zone(struct zone *zone, struct compact_control *cc) trace_mm_compaction_migratepages(nr_migrate - nr_remaining, nr_remaining); - /* Release LRU pages not migrated */ + /* Release isolated pages not migrated */ if (err) { - putback_lru_pages(&cc->migratepages); + putback_movable_pages(&cc->migratepages); cc->nr_migratepages = 0; if (err == -ENOMEM) { ret = COMPACT_PARTIAL; diff --git a/mm/migrate.c b/mm/migrate.c index 427343c..3f675ca 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -80,6 +80,26 @@ void putback_lru_pages(struct list_head *l) list_del(&page->lru); dec_zone_page_state(page, NR_ISOLATED_ANON + page_is_file_cache(page)); + putback_lru_page(page); + } +} + +/* + * Put previously isolated pages back onto the appropriate lists + * from where they were once taken off for compaction/migration. + * + * This function shall be used instead of putback_lru_pages(), + * whenever the isolated pageset has been built by isolate_migratepages_range() + */ +void putback_movable_pages(struct list_head *l) +{ + struct page *page; + struct page *page2; + + list_for_each_entry_safe(page, page2, l, lru) { + list_del(&page->lru); + dec_zone_page_state(page, NR_ISOLATED_ANON + + page_is_file_cache(page)); if (unlikely(balloon_page_movable(page))) balloon_page_putback(page); else diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5a7b761..d9fbac1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5761,7 +5761,7 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, 0, false, MIGRATE_SYNC); } - putback_lru_pages(&cc->migratepages); + putback_movable_pages(&cc->migratepages); return ret > 0 ? 0 : ret; } -- cgit v1.1 From 6f6313d4870f9642cb3ea8ec892cf6da81331b9c Mon Sep 17 00:00:00 2001 From: Jeff Liu Date: Tue, 11 Dec 2012 16:02:48 -0800 Subject: mm/vmscan.c: try_to_freeze() returns boolean kswapd()->try_to_freeze() is defined to return a boolean, so it's better to use a bool to hold its return value. Signed-off-by: Jie Liu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 5394731..157bb11 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2963,7 +2963,7 @@ static int kswapd(void *p) classzone_idx = new_classzone_idx = pgdat->nr_zones - 1; balanced_classzone_idx = classzone_idx; for ( ; ; ) { - int ret; + bool ret; /* * If the last balance_pgdat was unsuccessful it's unlikely a -- cgit v1.1 From 212a0a6f28dda0a1e732d20d57abb465750d473c Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:51 -0800 Subject: mm, mempolicy: remove duplicate code Remove some duplicate code and simplify alloc_pages_vma(). No functional change. Signed-off-by: David Rientjes Cc: KOSAKI Motohiro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ea600d..05b2836 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1907,7 +1907,6 @@ alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, unsigned long addr, int node) { struct mempolicy *pol; - struct zonelist *zl; struct page *page; unsigned int cpuset_mems_cookie; @@ -1926,23 +1925,11 @@ retry_cpuset: return page; } - zl = policy_zonelist(gfp, pol, node); - if (unlikely(mpol_needs_cond_ref(pol))) { - /* - * slow path: ref counted shared policy - */ - struct page *page = __alloc_pages_nodemask(gfp, order, - zl, policy_nodemask(gfp, pol)); - __mpol_put(pol); - if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) - goto retry_cpuset; - return page; - } - /* - * fast path: default or task policy - */ - page = __alloc_pages_nodemask(gfp, order, zl, + page = __alloc_pages_nodemask(gfp, order, + policy_zonelist(gfp, pol, node), policy_nodemask(gfp, pol)); + if (unlikely(mpol_needs_cond_ref(pol))) + __mpol_put(pol); if (unlikely(!put_mems_allowed(cpuset_mems_cookie) && !page)) goto retry_cpuset; return page; -- cgit v1.1 From a9c58b907dbc6821533dfc295b63caf111ff1f16 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:54 -0800 Subject: mm, oom: change type of oom_score_adj to short The maximum oom_score_adj is 1000 and the minimum oom_score_adj is -1000, so this range can be represented by the signed short type with no functional change. The extra space this frees up in struct signal_struct will be used for per-thread oom kill flags in the next patch. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 2 +- mm/oom_kill.c | 10 +++++----- mm/swapfile.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 31ae5ea..b4d5a9d 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1919,7 +1919,7 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { - int oom_score_adj; + short oom_score_adj; oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); err = unmerge_and_remove_all_rmap_items(); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 7e9e911..37ab4c5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -53,7 +53,7 @@ static DEFINE_SPINLOCK(zone_scan_lock); * @old_val. Usually used to reinstate a previous value to prevent racing with * userspacing tuning the value in the interim. */ -void compare_swap_oom_score_adj(int old_val, int new_val) +void compare_swap_oom_score_adj(short old_val, short new_val) { struct sighand_struct *sighand = current->sighand; @@ -72,7 +72,7 @@ void compare_swap_oom_score_adj(int old_val, int new_val) * synchronization and returns the old value. Usually used to temporarily * set a value, save the old value in the caller, and then reinstate it later. */ -int test_set_oom_score_adj(int new_val) +short test_set_oom_score_adj(short new_val) { struct sighand_struct *sighand = current->sighand; int old_val; @@ -193,7 +193,7 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, if (!p) return 0; - adj = p->signal->oom_score_adj; + adj = (long)p->signal->oom_score_adj; if (adj == OOM_SCORE_ADJ_MIN) { task_unlock(p); return 0; @@ -399,7 +399,7 @@ static void dump_tasks(const struct mem_cgroup *memcg, const nodemask_t *nodemas continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5d %s\n", + pr_info("[%5d] %5d %5d %8lu %8lu %7lu %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), task->mm->nr_ptes, @@ -415,7 +415,7 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order, { task_lock(current); pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, " - "oom_score_adj=%d\n", + "oom_score_adj=%hd\n", current->comm, gfp_mask, order, current->signal->oom_score_adj); cpuset_print_task_mems_allowed(current); diff --git a/mm/swapfile.c b/mm/swapfile.c index 0fbb452..bb6f6a0 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,7 +1498,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - int oom_score_adj; + short oom_score_adj; int i, type, prev; int err; -- cgit v1.1 From e1e12d2f3104be886073ac6c5c4678f30b1b9e51 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Tue, 11 Dec 2012 16:02:56 -0800 Subject: mm, oom: fix race when specifying a thread as the oom origin test_set_oom_score_adj() and compare_swap_oom_score_adj() are used to specify that current should be killed first if an oom condition occurs in between the two calls. The usage is short oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); ... compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); to store the thread's oom_score_adj, temporarily change it to the maximum score possible, and then restore the old value if it is still the same. This happens to still be racy, however, if the user writes OOM_SCORE_ADJ_MAX to /proc/pid/oom_score_adj in between the two calls. The compare_swap_oom_score_adj() will then incorrectly reset the old value prior to the write of OOM_SCORE_ADJ_MAX. To fix this, introduce a new oom_flags_t member in struct signal_struct that will be used for per-thread oom killer flags. KSM and swapoff can now use a bit in this member to specify that threads should be killed first in oom conditions without playing around with oom_score_adj. This also allows the correct oom_score_adj to always be shown when reading /proc/pid/oom_score. Signed-off-by: David Rientjes Cc: KAMEZAWA Hiroyuki Cc: KOSAKI Motohiro Reviewed-by: Michal Hocko Cc: Anton Vorontsov Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 7 ++----- mm/oom_kill.c | 49 +++++++------------------------------------------ mm/swapfile.c | 5 ++--- 3 files changed, 11 insertions(+), 50 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index b4d5a9d..382d930 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1919,12 +1919,9 @@ static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, if (ksm_run != flags) { ksm_run = flags; if (flags & KSM_RUN_UNMERGE) { - short oom_score_adj; - - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); + set_current_oom_origin(); err = unmerge_and_remove_all_rmap_items(); - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, - oom_score_adj); + clear_current_oom_origin(); if (err) { ksm_run = KSM_RUN_STOP; count = err; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 37ab4c5..18f1ae2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,48 +44,6 @@ int sysctl_oom_kill_allocating_task; int sysctl_oom_dump_tasks = 1; static DEFINE_SPINLOCK(zone_scan_lock); -/* - * compare_swap_oom_score_adj() - compare and swap current's oom_score_adj - * @old_val: old oom_score_adj for compare - * @new_val: new oom_score_adj for swap - * - * Sets the oom_score_adj value for current to @new_val iff its present value is - * @old_val. Usually used to reinstate a previous value to prevent racing with - * userspacing tuning the value in the interim. - */ -void compare_swap_oom_score_adj(short old_val, short new_val) -{ - struct sighand_struct *sighand = current->sighand; - - spin_lock_irq(&sighand->siglock); - if (current->signal->oom_score_adj == old_val) - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); -} - -/** - * test_set_oom_score_adj() - set current's oom_score_adj and return old value - * @new_val: new oom_score_adj value - * - * Sets the oom_score_adj value for current to @new_val with proper - * synchronization and returns the old value. Usually used to temporarily - * set a value, save the old value in the caller, and then reinstate it later. - */ -short test_set_oom_score_adj(short new_val) -{ - struct sighand_struct *sighand = current->sighand; - int old_val; - - spin_lock_irq(&sighand->siglock); - old_val = current->signal->oom_score_adj; - current->signal->oom_score_adj = new_val; - trace_oom_score_adj_update(current); - spin_unlock_irq(&sighand->siglock); - - return old_val; -} - #ifdef CONFIG_NUMA /** * has_intersects_mems_allowed() - check task eligiblity for kill @@ -310,6 +268,13 @@ enum oom_scan_t oom_scan_process_thread(struct task_struct *task, if (!task->mm) return OOM_SCAN_CONTINUE; + /* + * If task is allocating a lot of memory and has been marked to be + * killed first if it triggers an oom, then select it. + */ + if (oom_task_origin(task)) + return OOM_SCAN_SELECT; + if (task->flags & PF_EXITING && !force_kill) { /* * If this task is not being ptraced on exit, then wait for it diff --git a/mm/swapfile.c b/mm/swapfile.c index bb6f6a0..e97a0e5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1498,7 +1498,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) struct address_space *mapping; struct inode *inode; struct filename *pathname; - short oom_score_adj; int i, type, prev; int err; @@ -1557,9 +1556,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) p->flags &= ~SWP_WRITEOK; spin_unlock(&swap_lock); - oom_score_adj = test_set_oom_score_adj(OOM_SCORE_ADJ_MAX); + set_current_oom_origin(); err = try_to_unuse(type, false, 0); /* force all pages to be unused */ - compare_swap_oom_score_adj(OOM_SCORE_ADJ_MAX, oom_score_adj); + clear_current_oom_origin(); if (err) { /* re-insert swap space back into swap_list */ -- cgit v1.1 From 2e30abd1730751d58463d88bc0844ab8fd7112a9 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 11 Dec 2012 16:02:57 -0800 Subject: mm: cma: skip watermarks check for already isolated blocks in split_free_page() Since commit 2139cbe627b8 ("cma: fix counting of isolated pages") free pages in isolated pageblocks are not accounted to NR_FREE_PAGES counters, so watermarks check is not required if one operates on a free page in isolated pageblock. Signed-off-by: Marek Szyprowski Cc: Kyungmin Park Cc: Arnd Bergmann Cc: Mel Gorman Acked-by: Michal Nazarewicz Cc: Minchan Kim Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d9fbac1..265fea4fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1394,21 +1394,22 @@ int capture_free_page(struct page *page, int alloc_order, int migratetype) zone = page_zone(page); order = page_order(page); + mt = get_pageblock_migratetype(page); - /* Obey watermarks as if the page was being allocated */ - watermark = low_wmark_pages(zone) + (1 << order); - if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) - return 0; + if (mt != MIGRATE_ISOLATE) { + /* Obey watermarks as if the page was being allocated */ + watermark = low_wmark_pages(zone) + (1 << order); + if (!zone_watermark_ok(zone, 0, watermark, 0, 0)) + return 0; + + __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); + } /* Remove page from free list */ list_del(&page->lru); zone->free_area[order].nr_free--; rmv_page_order(page); - mt = get_pageblock_migratetype(page); - if (unlikely(mt != MIGRATE_ISOLATE)) - __mod_zone_freepage_state(zone, -(1UL << alloc_order), mt); - if (alloc_order != order) expand(zone, page, alloc_order, order, &zone->free_area[order], migratetype); -- cgit v1.1 From bc357f431c836c6631751e3ef7dfe7882394ad67 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 11 Dec 2012 16:02:59 -0800 Subject: mm: cma: remove watermark hacks Commits 2139cbe627b8 ("cma: fix counting of isolated pages") and d95ea5d18e69 ("cma: fix watermark checking") introduced a reliable method of free page accounting when memory is being allocated from CMA regions, so the workaround introduced earlier by commit 49f223a9cd96 ("mm: trigger page reclaim in alloc_contig_range() to stabilise watermarks") can be finally removed. Signed-off-by: Marek Szyprowski Cc: Kyungmin Park Cc: Arnd Bergmann Cc: Mel Gorman Acked-by: Michal Nazarewicz Cc: Minchan Kim Cc: Bartlomiej Zolnierkiewicz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 58 --------------------------------------------------------- 1 file changed, 58 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 265fea4fb..5a8d339 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5218,10 +5218,6 @@ static void __setup_per_zone_wmarks(void) zone->watermark[WMARK_LOW] = min_wmark_pages(zone) + (tmp >> 2); zone->watermark[WMARK_HIGH] = min_wmark_pages(zone) + (tmp >> 1); - zone->watermark[WMARK_MIN] += cma_wmark_pages(zone); - zone->watermark[WMARK_LOW] += cma_wmark_pages(zone); - zone->watermark[WMARK_HIGH] += cma_wmark_pages(zone); - setup_zone_migrate_reserve(zone); spin_unlock_irqrestore(&zone->lock, flags); } @@ -5766,54 +5762,6 @@ static int __alloc_contig_migrate_range(struct compact_control *cc, return ret > 0 ? 0 : ret; } -/* - * Update zone's cma pages counter used for watermark level calculation. - */ -static inline void __update_cma_watermarks(struct zone *zone, int count) -{ - unsigned long flags; - spin_lock_irqsave(&zone->lock, flags); - zone->min_cma_pages += count; - spin_unlock_irqrestore(&zone->lock, flags); - setup_per_zone_wmarks(); -} - -/* - * Trigger memory pressure bump to reclaim some pages in order to be able to - * allocate 'count' pages in single page units. Does similar work as - *__alloc_pages_slowpath() function. - */ -static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) -{ - enum zone_type high_zoneidx = gfp_zone(gfp_mask); - struct zonelist *zonelist = node_zonelist(0, gfp_mask); - int did_some_progress = 0; - int order = 1; - - /* - * Increase level of watermarks to force kswapd do his job - * to stabilise at new watermark level. - */ - __update_cma_watermarks(zone, count); - - /* Obey watermarks as if the page was being allocated */ - while (!zone_watermark_ok(zone, 0, low_wmark_pages(zone), 0, 0)) { - wake_all_kswapd(order, zonelist, high_zoneidx, zone_idx(zone)); - - did_some_progress = __perform_reclaim(gfp_mask, order, zonelist, - NULL); - if (!did_some_progress) { - /* Exhausted what can be done so it's blamo time */ - out_of_memory(zonelist, gfp_mask, order, NULL, false); - } - } - - /* Restore original watermark levels. */ - __update_cma_watermarks(zone, -count); - - return count; -} - /** * alloc_contig_range() -- tries to allocate given range of pages * @start: start PFN to allocate @@ -5837,7 +5785,6 @@ static int __reclaim_pages(struct zone *zone, gfp_t gfp_mask, int count) int alloc_contig_range(unsigned long start, unsigned long end, unsigned migratetype) { - struct zone *zone = page_zone(pfn_to_page(start)); unsigned long outer_start, outer_end; int ret = 0, order; @@ -5922,11 +5869,6 @@ int alloc_contig_range(unsigned long start, unsigned long end, goto done; } - /* - * Reclaim enough pages to make sure that contiguous allocation - * will not starve the system. - */ - __reclaim_pages(zone, GFP_HIGHUSER_MOVABLE, end-start); /* Grab isolated pages from freelists. */ outer_end = isolate_freepages_range(&cc, outer_start, end); -- cgit v1.1 From 81df9bff2609f07cef4690ac2ebda1611b55b05a Mon Sep 17 00:00:00 2001 From: Joonsoo Kim Date: Tue, 11 Dec 2012 16:03:10 -0800 Subject: bootmem: fix wrong call parameter for free_bootmem() It is strange that alloc_bootmem() returns a virtual address and free_bootmem() requires a physical address. Anyway, free_bootmem()'s first parameter should be physical address. There are some call sites for free_bootmem() with virtual address. So fix them. [akpm@linux-foundation.org: improve free_bootmem() and free_bootmem_pate() documentation] Signed-off-by: Joonsoo Kim Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Johannes Weiner Cc: FUJITA Tomonori Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/bootmem.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/bootmem.c b/mm/bootmem.c index f468185..ecc4595 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -147,21 +147,21 @@ unsigned long __init init_bootmem(unsigned long start, unsigned long pages) /* * free_bootmem_late - free bootmem pages directly to page allocator - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * This is only useful when the bootmem allocator has already been torn * down, but we are still initializing the system. Pages are given directly * to the page allocator, no bootmem metadata is updated because it is gone. */ -void __init free_bootmem_late(unsigned long addr, unsigned long size) +void __init free_bootmem_late(unsigned long physaddr, unsigned long size) { unsigned long cursor, end; - kmemleak_free_part(__va(addr), size); + kmemleak_free_part(__va(physaddr), size); - cursor = PFN_UP(addr); - end = PFN_DOWN(addr + size); + cursor = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); for (; cursor < end; cursor++) { __free_pages_bootmem(pfn_to_page(cursor), 0); @@ -377,21 +377,21 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr, /** * free_bootmem - mark a page range as usable - * @addr: starting address of the range + * @addr: starting physical address of the range * @size: size of the range in bytes * * Partial pages will be considered reserved and left as they are. * * The range must be contiguous but may span node boundaries. */ -void __init free_bootmem(unsigned long addr, unsigned long size) +void __init free_bootmem(unsigned long physaddr, unsigned long size) { unsigned long start, end; - kmemleak_free_part(__va(addr), size); + kmemleak_free_part(__va(physaddr), size); - start = PFN_UP(addr); - end = PFN_DOWN(addr + size); + start = PFN_UP(physaddr); + end = PFN_DOWN(physaddr + size); mark_bootmem(start, end, 0, 0); } -- cgit v1.1 From 511c2aba8f07fc45bdcba548cb63f7b8a450c6dc Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:03:16 -0800 Subject: mm, memory-hotplug: dynamic configure movable memory and portion memory Add online_movable and online_kernel for logic memory hotplug. This is the dynamic version of "movablecore" & "kernelcore". We have the same reason to introduce it as to introduce "movablecore" & "kernelcore". It has the same motive as "movablecore" & "kernelcore", but it is dynamic/running-time: o We can configure memory as kernelcore or movablecore after boot. Userspace workload is increased, we need more hugepage, we can't use "online_movable" to add memory and allow the system use more THP(transparent-huge-page), vice-verse when kernel workload is increase. Also help for virtualization to dynamic configure host/guest's memory, to save/(reduce waste) memory. Memory capacity on Demand o When a new node is physically online after boot, we need to use "online_movable" or "online_kernel" to configure/portion it as we expected when we logic-online it. This configuration also helps for physically-memory-migrate. o all benefit as the same as existed "movablecore" & "kernelcore". o Preparing for movable-node, which is very important for power-saving, hardware partitioning and high-available-system(hardware fault management). (Note, we don't introduce movable-node here.) Action behavior: When a memoryblock/memorysection is onlined by "online_movable", the kernel will not have directly reference to the page of the memoryblock, thus we can remove that memory any time when needed. When it is online by "online_kernel", the kernel can use it. When it is online by "online", the zone type doesn't changed. Current constraints: Only the memoryblock which is adjacent to the ZONE_MOVABLE can be online from ZONE_NORMAL to ZONE_MOVABLE. [akpm@linux-foundation.org: use min_t, cleanups] Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 100 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 571130e..5c1f495 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -214,6 +214,88 @@ static void grow_zone_span(struct zone *zone, unsigned long start_pfn, zone_span_writeunlock(zone); } +static void resize_zone(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + zone_span_writelock(zone); + + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + + zone_span_writeunlock(zone); +} + +static void fix_zone_id(struct zone *zone, unsigned long start_pfn, + unsigned long end_pfn) +{ + enum zone_type zid = zone_idx(zone); + int nid = zone->zone_pgdat->node_id; + unsigned long pfn; + + for (pfn = start_pfn; pfn < end_pfn; pfn++) + set_page_links(pfn_to_page(pfn), zid, nid, pfn); +} + +static int move_pfn_range_left(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long flags; + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are higher than @z2 */ + if (end_pfn > z2->zone_start_pfn + z2->spanned_pages) + goto out_fail; + /* the move out part mast at the left most of @z2 */ + if (start_pfn > z2->zone_start_pfn) + goto out_fail; + /* must included/overlap */ + if (end_pfn <= z2->zone_start_pfn) + goto out_fail; + + resize_zone(z1, z1->zone_start_pfn, end_pfn); + resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z1, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + +static int move_pfn_range_right(struct zone *z1, struct zone *z2, + unsigned long start_pfn, unsigned long end_pfn) +{ + unsigned long flags; + + pgdat_resize_lock(z1->zone_pgdat, &flags); + + /* can't move pfns which are lower than @z1 */ + if (z1->zone_start_pfn > start_pfn) + goto out_fail; + /* the move out part mast at the right most of @z1 */ + if (z1->zone_start_pfn + z1->spanned_pages > end_pfn) + goto out_fail; + /* must included/overlap */ + if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) + goto out_fail; + + resize_zone(z1, z1->zone_start_pfn, start_pfn); + resize_zone(z2, start_pfn, z2->zone_start_pfn + z2->spanned_pages); + + pgdat_resize_unlock(z1->zone_pgdat, &flags); + + fix_zone_id(z2, start_pfn, end_pfn); + + return 0; +out_fail: + pgdat_resize_unlock(z1->zone_pgdat, &flags); + return -1; +} + static void grow_pgdat_span(struct pglist_data *pgdat, unsigned long start_pfn, unsigned long end_pfn) { @@ -508,7 +590,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) } -int __ref online_pages(unsigned long pfn, unsigned long nr_pages) +int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { unsigned long onlined_pages = 0; struct zone *zone; @@ -525,6 +607,22 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages) */ zone = page_zone(pfn_to_page(pfn)); + if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { + if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { + unlock_memory_hotplug(); + return -1; + } + } + if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { + if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { + unlock_memory_hotplug(); + return -1; + } + } + + /* Previous code may changed the zone of the pfn range */ + zone = page_zone(pfn_to_page(pfn)); + arg.start_pfn = pfn; arg.nr_pages = nr_pages; node_states_check_changes_online(nr_pages, zone, &arg); -- cgit v1.1 From e455a9b92d6e19a3f0f7eb6f6241efa566a7e81a Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:03:20 -0800 Subject: memory_hotplug: handle empty zone when online_movable/online_kernel Make online_movable/online_kernel can empty a zone or can move memory to a empty zone. Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 51 +++++++++++++++++++++++++++++++++++++++++++++------ 1 file changed, 45 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 5c1f495..c370491 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -219,8 +219,17 @@ static void resize_zone(struct zone *zone, unsigned long start_pfn, { zone_span_writelock(zone); - zone->zone_start_pfn = start_pfn; - zone->spanned_pages = end_pfn - start_pfn; + if (end_pfn - start_pfn) { + zone->zone_start_pfn = start_pfn; + zone->spanned_pages = end_pfn - start_pfn; + } else { + /* + * make it consist as free_area_init_core(), + * if spanned_pages = 0, then keep start_pfn = 0 + */ + zone->zone_start_pfn = 0; + zone->spanned_pages = 0; + } zone_span_writeunlock(zone); } @@ -236,10 +245,19 @@ static void fix_zone_id(struct zone *zone, unsigned long start_pfn, set_page_links(pfn_to_page(pfn), zid, nid, pfn); } -static int move_pfn_range_left(struct zone *z1, struct zone *z2, +static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { + int ret; unsigned long flags; + unsigned long z1_start_pfn; + + if (!z1->wait_table) { + ret = init_currently_empty_zone(z1, start_pfn, + end_pfn - start_pfn, MEMMAP_HOTPLUG); + if (ret) + return ret; + } pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -253,7 +271,13 @@ static int move_pfn_range_left(struct zone *z1, struct zone *z2, if (end_pfn <= z2->zone_start_pfn) goto out_fail; - resize_zone(z1, z1->zone_start_pfn, end_pfn); + /* use start_pfn for z1's start_pfn if z1 is empty */ + if (z1->spanned_pages) + z1_start_pfn = z1->zone_start_pfn; + else + z1_start_pfn = start_pfn; + + resize_zone(z1, z1_start_pfn, end_pfn); resize_zone(z2, end_pfn, z2->zone_start_pfn + z2->spanned_pages); pgdat_resize_unlock(z1->zone_pgdat, &flags); @@ -266,10 +290,19 @@ out_fail: return -1; } -static int move_pfn_range_right(struct zone *z1, struct zone *z2, +static int __meminit move_pfn_range_right(struct zone *z1, struct zone *z2, unsigned long start_pfn, unsigned long end_pfn) { + int ret; unsigned long flags; + unsigned long z2_end_pfn; + + if (!z2->wait_table) { + ret = init_currently_empty_zone(z2, start_pfn, + end_pfn - start_pfn, MEMMAP_HOTPLUG); + if (ret) + return ret; + } pgdat_resize_lock(z1->zone_pgdat, &flags); @@ -283,8 +316,14 @@ static int move_pfn_range_right(struct zone *z1, struct zone *z2, if (start_pfn >= z1->zone_start_pfn + z1->spanned_pages) goto out_fail; + /* use end_pfn for z2's end_pfn if z2 is empty */ + if (z2->spanned_pages) + z2_end_pfn = z2->zone_start_pfn + z2->spanned_pages; + else + z2_end_pfn = end_pfn; + resize_zone(z1, z1->zone_start_pfn, start_pfn); - resize_zone(z2, start_pfn, z2->zone_start_pfn + z2->spanned_pages); + resize_zone(z2, start_pfn, z2_end_pfn); pgdat_resize_unlock(z1->zone_pgdat, &flags); -- cgit v1.1 From 74d42d8fe146e870c52bde3b1c692f86cc8ff844 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 11 Dec 2012 16:03:23 -0800 Subject: memory_hotplug: ensure every online node has NORMAL memory Old memory hotplug code and new online/movable may cause a online node don't have any normal memory, but memory-management acts bad when we have nodes which is online but don't have any normal memory. Example: it may cause a bound task fail on all kernel allocation and cause the task can't create task or create other kernel object. So we disable non-normal-memory-node here, we will enable it when we prepared. Signed-off-by: Lai Jiangshan Signed-off-by: Wen Congyang Cc: Yasuaki Ishimatsu Cc: Lai Jiangshan Cc: Jiang Liu Cc: KOSAKI Motohiro Cc: Minchan Kim Cc: Mel Gorman Cc: David Rientjes Cc: Yinghai Lu Cc: Rusty Russell Cc: Greg KH Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 40 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) (limited to 'mm') diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c370491..de9cb14 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -581,6 +581,12 @@ static int online_pages_range(unsigned long start_pfn, unsigned long nr_pages, return 0; } +/* ensure every online node has NORMAL memory */ +static bool can_online_high_movable(struct zone *zone) +{ + return node_state(zone_to_nid(zone), N_NORMAL_MEMORY); +} + /* check which state of node_states will be changed when online memory */ static void node_states_check_changes_online(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -646,6 +652,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ */ zone = page_zone(pfn_to_page(pfn)); + if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && + !can_online_high_movable(zone)) { + unlock_memory_hotplug(); + return -1; + } + if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); @@ -1054,6 +1066,30 @@ check_pages_isolated(unsigned long start_pfn, unsigned long end_pfn) return offlined; } +/* ensure the node has NORMAL memory if it is still online */ +static bool can_offline_normal(struct zone *zone, unsigned long nr_pages) +{ + struct pglist_data *pgdat = zone->zone_pgdat; + unsigned long present_pages = 0; + enum zone_type zt; + + for (zt = 0; zt <= ZONE_NORMAL; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + if (present_pages > nr_pages) + return true; + + present_pages = 0; + for (; zt <= ZONE_MOVABLE; zt++) + present_pages += pgdat->node_zones[zt].present_pages; + + /* + * we can't offline the last normal memory until all + * higher memory is offlined. + */ + return present_pages == 0; +} + /* check which state of node_states will be changed when offline memory */ static void node_states_check_changes_offline(unsigned long nr_pages, struct zone *zone, struct memory_notify *arg) @@ -1141,6 +1177,10 @@ static int __ref __offline_pages(unsigned long start_pfn, node = zone_to_nid(zone); nr_pages = end_pfn - start_pfn; + ret = -EINVAL; + if (zone_idx(zone) <= ZONE_NORMAL && !can_offline_normal(zone, nr_pages)) + goto out; + /* set above range as isolated */ ret = start_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE, true); -- cgit v1.1