diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/Kconfig | 42 | ||||
-rw-r--r-- | mm/Makefile | 2 | ||||
-rw-r--r-- | mm/backing-dev.c | 5 | ||||
-rw-r--r-- | mm/bootmem.c | 39 | ||||
-rw-r--r-- | mm/filemap.c | 6 | ||||
-rw-r--r-- | mm/huge_memory.c | 30 | ||||
-rw-r--r-- | mm/hugetlb.c | 4 | ||||
-rw-r--r-- | mm/internal.h | 5 | ||||
-rw-r--r-- | mm/memblock.c | 2 | ||||
-rw-r--r-- | mm/memcontrol.c | 363 | ||||
-rw-r--r-- | mm/memory-failure.c | 22 | ||||
-rw-r--r-- | mm/memory.c | 15 | ||||
-rw-r--r-- | mm/memory_hotplug.c | 139 | ||||
-rw-r--r-- | mm/mm_init.c | 47 | ||||
-rw-r--r-- | mm/mmap.c | 40 | ||||
-rw-r--r-- | mm/mmu_notifier.c | 2 | ||||
-rw-r--r-- | mm/mremap.c | 20 | ||||
-rw-r--r-- | mm/nobootmem.c | 35 | ||||
-rw-r--r-- | mm/nommu.c | 10 | ||||
-rw-r--r-- | mm/page_alloc.c | 384 | ||||
-rw-r--r-- | mm/page_io.c | 50 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 5 | ||||
-rw-r--r-- | mm/rmap.c | 9 | ||||
-rw-r--r-- | mm/shmem.c | 16 | ||||
-rw-r--r-- | mm/slab.c | 51 | ||||
-rw-r--r-- | mm/slab.h | 3 | ||||
-rw-r--r-- | mm/slab_common.c | 18 | ||||
-rw-r--r-- | mm/slob.c | 4 | ||||
-rw-r--r-- | mm/slub.c | 38 | ||||
-rw-r--r-- | mm/sparse.c | 8 | ||||
-rw-r--r-- | mm/swap.c | 106 | ||||
-rw-r--r-- | mm/swapfile.c | 55 | ||||
-rw-r--r-- | mm/util.c | 1 | ||||
-rw-r--r-- | mm/vmalloc.c | 164 | ||||
-rw-r--r-- | mm/vmscan.c | 605 | ||||
-rw-r--r-- | mm/zbud.c | 527 | ||||
-rw-r--r-- | mm/zswap.c | 943 |
37 files changed, 2862 insertions, 953 deletions
@@ -501,3 +501,45 @@ config CMA_DEBUG messages for every CMA call as well as various messages while processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. + +config ZBUD + tristate + default n + help + A special purpose allocator for storing compressed pages. + It is designed to store up to two compressed pages per physical + page. While this design limits storage density, it has simple and + deterministic reclaim properties that make it preferable to a higher + density approach when reclaim will be used. + +config ZSWAP + bool "Compressed cache for swap pages (EXPERIMENTAL)" + depends on FRONTSWAP && CRYPTO=y + select CRYPTO_LZO + select ZBUD + default n + help + A lightweight compressed cache for swap pages. It takes + pages that are in the process of being swapped out and attempts to + compress them into a dynamically allocated RAM-based memory pool. + This can result in a significant I/O reduction on swap device and, + in the case where decompressing from RAM is faster that swap device + reads, can also improve workload performance. + + This is marked experimental because it is a new feature (as of + v3.11) that interacts heavily with memory reclaim. While these + interactions don't cause any known issues on simple memory setups, + they have not be fully explored on the large set of potential + configurations and workloads that exist. + +config MEM_SOFT_DIRTY + bool "Track memory changes" + depends on CHECKPOINT_RESTORE && HAVE_ARCH_SOFT_DIRTY + select PROC_PAGE_MONITOR + help + This option enables memory changes tracking by introducing a + soft-dirty bit on pte-s. This bit it set when someone writes + into a page just as regular dirty bit, but unlike the latter + it can be cleared by hands. + + See Documentation/vm/soft-dirty.txt for more details. diff --git a/mm/Makefile b/mm/Makefile index 72c5acb..f008033 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_HAVE_MEMBLOCK) += memblock.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_SWAP) += page_io.o swap_state.o swapfile.o obj-$(CONFIG_FRONTSWAP) += frontswap.o +obj-$(CONFIG_ZSWAP) += zswap.o obj-$(CONFIG_HAS_DMA) += dmapool.o obj-$(CONFIG_HUGETLBFS) += hugetlb.o obj-$(CONFIG_NUMA) += mempolicy.o @@ -58,3 +59,4 @@ obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o obj-$(CONFIG_CLEANCACHE) += cleancache.o obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o +obj-$(CONFIG_ZBUD) += zbud.o diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5025174..d014ee5 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -515,7 +515,6 @@ EXPORT_SYMBOL(bdi_destroy); int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, unsigned int cap) { - char tmp[32]; int err; bdi->name = name; @@ -524,8 +523,8 @@ int bdi_setup_and_register(struct backing_dev_info *bdi, char *name, if (err) return err; - sprintf(tmp, "%.28s%s", name, "-%d"); - err = bdi_register(bdi, NULL, tmp, atomic_long_inc_return(&bdi_seq)); + err = bdi_register(bdi, NULL, "%.28s-%ld", name, + atomic_long_inc_return(&bdi_seq)); if (err) { bdi_destroy(bdi); return err; diff --git a/mm/bootmem.c b/mm/bootmem.c index 2b0bcb0..6ab7744 100644 --- a/mm/bootmem.c +++ b/mm/bootmem.c @@ -241,33 +241,26 @@ static unsigned long __init free_all_bootmem_core(bootmem_data_t *bdata) return count; } -static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - /* - * In free_area_init_core(), highmem zone's managed_pages is set to - * present_pages, and bootmem allocator doesn't allocate from highmem - * zones. So there's no need to recalculate managed_pages because all - * highmem pages will be managed by the buddy system. Here highmem - * zone also includes highmem movable zone. - */ + if (reset_managed_pages_done) + return; + for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - if (!is_highmem(z)) - z->managed_pages = 0; + z->managed_pages = 0; } -/** - * free_all_bootmem_node - release a node's free pages to the buddy allocator - * @pgdat: node to be released - * - * Returns the number of pages actually released. - */ -unsigned long __init free_all_bootmem_node(pg_data_t *pgdat) +void __init reset_all_zones_managed_pages(void) { - register_page_bootmem_info_node(pgdat); - reset_node_lowmem_managed_pages(pgdat); - return free_all_bootmem_core(pgdat->bdata); + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -279,14 +272,14 @@ unsigned long __init free_all_bootmem(void) { unsigned long total_pages = 0; bootmem_data_t *bdata; - struct pglist_data *pgdat; - for_each_online_pgdat(pgdat) - reset_node_lowmem_managed_pages(pgdat); + reset_all_zones_managed_pages(); list_for_each_entry(bdata, &bdata_list, list) total_pages += free_all_bootmem_core(bdata); + totalram_pages += total_pages; + return total_pages; } diff --git a/mm/filemap.c b/mm/filemap.c index 7905fe7..4b51ac1 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (!ra->ra_pages) return; - if (VM_SequentialReadHint(vma)) { + if (vma->vm_flags & VM_SEQ_READ) { page_cache_sync_readahead(mapping, ra, file, offset, ra->ra_pages); return; @@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, struct address_space *mapping = file->f_mapping; /* If we don't want any read-ahead, don't bother */ - if (VM_RandomReadHint(vma)) + if (vma->vm_flags & VM_RAND_READ) return; if (ra->mmap_miss > 0) ra->mmap_miss--; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 362c329..243e710 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -729,8 +729,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, pmd_t entry; entry = mk_huge_pmd(page, vma); page_add_new_anon_rmap(page, vma, haddr); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); mm->nr_ptes++; spin_unlock(&mm->page_table_lock); @@ -771,8 +771,8 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_wrprotect(entry); entry = pmd_mkhuge(entry); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); - pgtable_trans_huge_deposit(mm, pgtable); mm->nr_ptes++; return true; } @@ -916,8 +916,8 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmdp_set_wrprotect(src_mm, addr, src_pmd); pmd = pmd_mkold(pmd_wrprotect(pmd)); + pgtable_trans_huge_deposit(dst_mm, dst_pmd, pgtable); set_pmd_at(dst_mm, addr, dst_pmd, pmd); - pgtable_trans_huge_deposit(dst_mm, pgtable); dst_mm->nr_ptes++; ret = 0; @@ -987,7 +987,7 @@ static int do_huge_pmd_wp_zero_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1085,7 +1085,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -1265,7 +1265,9 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, * young bit, instead of the current set_pmd_at. */ _pmd = pmd_mkyoung(pmd_mkdirty(*pmd)); - set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd); + if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK, + pmd, _pmd, 1)) + update_mmu_cache_pmd(vma, addr, pmd); } if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) { if (page->mapping && trylock_page(page)) { @@ -1358,9 +1360,15 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct page *page; pgtable_t pgtable; pmd_t orig_pmd; - pgtable = pgtable_trans_huge_withdraw(tlb->mm); + /* + * For architectures like ppc64 we look at deposited pgtable + * when calling pmdp_get_and_clear. So do the + * pgtable_trans_huge_withdraw after finishing pmdp related + * operations. + */ orig_pmd = pmdp_get_and_clear(tlb->mm, addr, pmd); tlb_remove_pmd_tlb_entry(tlb, pmd, addr); + pgtable = pgtable_trans_huge_withdraw(tlb->mm, pmd); if (is_huge_zero_pmd(orig_pmd)) { tlb->mm->nr_ptes--; spin_unlock(&tlb->mm->page_table_lock); @@ -1429,7 +1437,7 @@ int move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, if (ret == 1) { pmd = pmdp_get_and_clear(mm, old_addr, old_pmd); VM_BUG_ON(!pmd_none(*new_pmd)); - set_pmd_at(mm, new_addr, new_pmd, pmd); + set_pmd_at(mm, new_addr, new_pmd, pmd_mksoft_dirty(pmd)); spin_unlock(&mm->page_table_lock); } out: @@ -1691,7 +1699,7 @@ static int __split_huge_page_map(struct page *page, pmd = page_check_address_pmd(page, mm, address, PAGE_CHECK_ADDRESS_PMD_SPLITTING_FLAG); if (pmd) { - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); haddr = address; @@ -2359,9 +2367,9 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(&mm->page_table_lock); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); - pgtable_trans_huge_deposit(mm, pgtable); spin_unlock(&mm->page_table_lock); *hpage = NULL; @@ -2667,7 +2675,7 @@ static void __split_huge_zero_page_pmd(struct vm_area_struct *vma, pmdp_clear_flush(vma, haddr, pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(mm); + pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { diff --git a/mm/hugetlb.c b/mm/hugetlb.c index aed085a..83aff0a 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -319,7 +319,7 @@ unsigned long vma_kernel_pagesize(struct vm_area_struct *vma) hstate = hstate_vma(vma); - return 1UL << (hstate->order + PAGE_SHIFT); + return 1UL << huge_page_shift(hstate); } EXPORT_SYMBOL_GPL(vma_kernel_pagesize); @@ -1263,7 +1263,7 @@ static void __init gather_bootmem_prealloc(void) * side-effects, like CommitLimit going negative. */ if (h->order > (MAX_ORDER - 1)) - totalram_pages += 1 << h->order; + adjust_managed_page_count(page, 1 << h->order); } } diff --git a/mm/internal.h b/mm/internal.h index 8562de0..4390ac6 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -static inline void __put_page(struct page *page) -{ - atomic_dec(&page->_count); -} - static inline void __get_page_tail_foll(struct page *page, bool get_page_head) { diff --git a/mm/memblock.c b/mm/memblock.c index c5fad93..a847bfe6 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size) /** * __next_free_mem_range - next function for for_each_free_mem_range() * @idx: pointer to u64 loop variable - * @nid: nid: node selector, %MAX_NUMNODES for all nodes + * @nid: node selector, %MAX_NUMNODES for all nodes * @out_start: ptr to phys_addr_t for start address of the range, can be %NULL * @out_end: ptr to phys_addr_t for end address of the range, can be %NULL * @out_nid: ptr to int for nid of the range, can be %NULL diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1947218..d12ca6f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -187,10 +187,6 @@ struct mem_cgroup_per_node { struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES]; }; -struct mem_cgroup_lru_info { - struct mem_cgroup_per_node *nodeinfo[0]; -}; - /* * Cgroups above their limits are maintained in a RB-Tree, independent of * their hierarchy representation @@ -267,28 +263,10 @@ struct mem_cgroup { /* vmpressure notifications */ struct vmpressure vmpressure; - union { - /* - * the counter to account for mem+swap usage. - */ - struct res_counter memsw; - - /* - * rcu_freeing is used only when freeing struct mem_cgroup, - * so put it into a union to avoid wasting more memory. - * It must be disjoint from the css field. It could be - * in a union with the res field, but res plays a much - * larger part in mem_cgroup life than memsw, and might - * be of interest, even at time of free, when debugging. - * So share rcu_head with the less interesting memsw. - */ - struct rcu_head rcu_freeing; - /* - * We also need some space for a worker in deferred freeing. - * By the time we call it, rcu_freeing is no longer in use. - */ - struct work_struct work_freeing; - }; + /* + * the counter to account for mem+swap usage. + */ + struct res_counter memsw; /* * the counter to account for kernel memory usage. @@ -303,8 +281,6 @@ struct mem_cgroup { bool oom_lock; atomic_t under_oom; - atomic_t refcnt; - int swappiness; /* OOM-Killer disable */ int oom_kill_disable; @@ -366,14 +342,8 @@ struct mem_cgroup { atomic_t numainfo_updating; #endif - /* - * Per cgroup active and inactive list, similar to the - * per zone LRU lists. - * - * WARNING: This has to be the last element of the struct. Don't - * add new fields after this point. - */ - struct mem_cgroup_lru_info info; + struct mem_cgroup_per_node *nodeinfo[0]; + /* WARNING: nodeinfo must be the last member here */ }; static size_t memcg_size(void) @@ -416,6 +386,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg) static void memcg_kmem_mark_dead(struct mem_cgroup *memcg) { + /* + * Our caller must use css_get() first, because memcg_uncharge_kmem() + * will call css_put() if it sees the memcg is dead. + */ + smp_wmb(); if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags)) set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags); } @@ -508,9 +483,6 @@ enum res_type { */ static DEFINE_MUTEX(memcg_create_mutex); -static void mem_cgroup_get(struct mem_cgroup *memcg); -static void mem_cgroup_put(struct mem_cgroup *memcg); - static inline struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) { @@ -561,15 +533,15 @@ void sock_update_memcg(struct sock *sk) */ if (sk->sk_cgrp) { BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); - mem_cgroup_get(sk->sk_cgrp->memcg); + css_get(&sk->sk_cgrp->memcg->css); return; } rcu_read_lock(); memcg = mem_cgroup_from_task(current); cg_proto = sk->sk_prot->proto_cgroup(memcg); - if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) { - mem_cgroup_get(memcg); + if (!mem_cgroup_is_root(memcg) && + memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { sk->sk_cgrp = cg_proto; } rcu_read_unlock(); @@ -583,7 +555,7 @@ void sock_release_memcg(struct sock *sk) struct mem_cgroup *memcg; WARN_ON(!sk->sk_cgrp->memcg); memcg = sk->sk_cgrp->memcg; - mem_cgroup_put(memcg); + css_put(&sk->sk_cgrp->memcg->css); } } @@ -683,7 +655,7 @@ static struct mem_cgroup_per_zone * mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid) { VM_BUG_ON((unsigned)nid >= nr_node_ids); - return &memcg->info.nodeinfo[nid]->zoneinfo[zid]; + return &memcg->nodeinfo[nid]->zoneinfo[zid]; } struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg) @@ -1148,6 +1120,58 @@ skip_node: return NULL; } +static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) +{ + /* + * When a group in the hierarchy below root is destroyed, the + * hierarchy iterator can no longer be trusted since it might + * have pointed to the destroyed group. Invalidate it. + */ + atomic_inc(&root->dead_count); +} + +static struct mem_cgroup * +mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *root, + int *sequence) +{ + struct mem_cgroup *position = NULL; + /* + * A cgroup destruction happens in two stages: offlining and + * release. They are separated by a RCU grace period. + * + * If the iterator is valid, we may still race with an + * offlining. The RCU lock ensures the object won't be + * released, tryget will fail if we lost the race. + */ + *sequence = atomic_read(&root->dead_count); + if (iter->last_dead_count == *sequence) { + smp_rmb(); + position = iter->last_visited; + if (position && !css_tryget(&position->css)) + position = NULL; + } + return position; +} + +static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, + struct mem_cgroup *last_visited, + struct mem_cgroup *new_position, + int sequence) +{ + if (last_visited) + css_put(&last_visited->css); + /* + * We store the sequence count from the time @last_visited was + * loaded successfully instead of rereading it here so that we + * don't lose destruction events in between. We could have + * raced with the destruction of @new_position after all. + */ + iter->last_visited = new_position; + smp_wmb(); + iter->last_dead_count = sequence; +} + /** * mem_cgroup_iter - iterate over memory cgroup hierarchy * @root: hierarchy root @@ -1171,7 +1195,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, { struct mem_cgroup *memcg = NULL; struct mem_cgroup *last_visited = NULL; - unsigned long uninitialized_var(dead_count); if (mem_cgroup_disabled()) return NULL; @@ -1191,6 +1214,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, rcu_read_lock(); while (!memcg) { struct mem_cgroup_reclaim_iter *uninitialized_var(iter); + int uninitialized_var(seq); if (reclaim) { int nid = zone_to_nid(reclaim->zone); @@ -1204,37 +1228,13 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, goto out_unlock; } - /* - * If the dead_count mismatches, a destruction - * has happened or is happening concurrently. - * If the dead_count matches, a destruction - * might still happen concurrently, but since - * we checked under RCU, that destruction - * won't free the object until we release the - * RCU reader lock. Thus, the dead_count - * check verifies the pointer is still valid, - * css_tryget() verifies the cgroup pointed to - * is alive. - */ - dead_count = atomic_read(&root->dead_count); - if (dead_count == iter->last_dead_count) { - smp_rmb(); - last_visited = iter->last_visited; - if (last_visited && - !css_tryget(&last_visited->css)) - last_visited = NULL; - } + last_visited = mem_cgroup_iter_load(iter, root, &seq); } memcg = __mem_cgroup_iter_next(root, last_visited); if (reclaim) { - if (last_visited) - css_put(&last_visited->css); - - iter->last_visited = memcg; - smp_wmb(); - iter->last_dead_count = dead_count; + mem_cgroup_iter_update(iter, last_visited, memcg, seq); if (!memcg) iter->generation++; @@ -1448,11 +1448,12 @@ static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, return ret; } -int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) +bool task_in_mem_cgroup(struct task_struct *task, + const struct mem_cgroup *memcg) { - int ret; struct mem_cgroup *curr = NULL; struct task_struct *p; + bool ret; p = find_lock_task_mm(task); if (p) { @@ -1464,14 +1465,14 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) * killer still needs to detect if they have already been oom * killed to prevent needlessly killing additional tasks. */ - task_lock(task); + rcu_read_lock(); curr = mem_cgroup_from_task(task); if (curr) css_get(&curr->css); - task_unlock(task); + rcu_read_unlock(); } if (!curr) - return 0; + return false; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -3031,8 +3032,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) if (res_counter_uncharge(&memcg->kmem, size)) return; + /* + * Releases a reference taken in kmem_cgroup_css_offline in case + * this last uncharge is racing with the offlining code or it is + * outliving the memcg existence. + * + * The memory barrier imposed by test&clear is paired with the + * explicit one in memcg_kmem_mark_dead(). + */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) @@ -3223,7 +3232,7 @@ void memcg_release_cache(struct kmem_cache *s) list_del(&s->memcg_params->list); mutex_unlock(&memcg->slab_caches_mutex); - mem_cgroup_put(memcg); + css_put(&memcg->css); out: kfree(s->memcg_params); } @@ -3383,16 +3392,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, mutex_lock(&memcg_cache_mutex); new_cachep = cachep->memcg_params->memcg_caches[idx]; - if (new_cachep) + if (new_cachep) { + css_put(&memcg->css); goto out; + } new_cachep = kmem_cache_dup(memcg, cachep); if (new_cachep == NULL) { new_cachep = cachep; + css_put(&memcg->css); goto out; } - mem_cgroup_get(memcg); atomic_set(&new_cachep->memcg_params->nr_pages , 0); cachep->memcg_params->memcg_caches[idx] = new_cachep; @@ -3480,8 +3491,6 @@ static void memcg_create_cache_work_func(struct work_struct *w) cw = container_of(w, struct create_work, work); memcg_create_kmem_cache(cw->memcg, cw->cachep); - /* Drop the reference gotten when we enqueued. */ - css_put(&cw->memcg->css); kfree(cw); } @@ -3618,6 +3627,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) int ret; *_memcg = NULL; + + /* + * Disabling accounting is only relevant for some specific memcg + * internal allocations. Therefore we would initially not have such + * check here, since direct calls to the page allocator that are marked + * with GFP_KMEMCG only happen outside memcg core. We are mostly + * concerned with cache allocations, and by having this test at + * memcg_kmem_get_cache, we are already able to relay the allocation to + * the root cache and bypass the memcg cache altogether. + * + * There is one exception, though: the SLUB allocator does not create + * large order caches, but rather service large kmallocs directly from + * the page allocator. Therefore, the following sequence when backed by + * the SLUB allocator: + * + * memcg_stop_kmem_account(); + * kmalloc(<large_number>) + * memcg_resume_kmem_account(); + * + * would effectively ignore the fact that we should skip accounting, + * since it will drive us directly to this function without passing + * through the cache selector memcg_kmem_get_cache. Such large + * allocations are extremely rare but can happen, for instance, for the + * cache arrays. We bring this test here. + */ + if (!current->mm || current->memcg_kmem_skip_account) + return true; + memcg = try_get_mem_cgroup_from_mm(current->mm); /* @@ -4171,12 +4208,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, unlock_page_cgroup(pc); /* * even after unlock, we have memcg->res.usage here and this memcg - * will never be freed. + * will never be freed, so it's safe to call css_get(). */ memcg_check_events(memcg, page); if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { mem_cgroup_swap_statistics(memcg, true); - mem_cgroup_get(memcg); + css_get(&memcg->css); } /* * Migration does not charge the res_counter for the @@ -4288,7 +4325,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) /* * record memcg information, if swapout && memcg != NULL, - * mem_cgroup_get() was called in uncharge(). + * css_get() was called in uncharge(). */ if (do_swap_account && swapout && memcg) swap_cgroup_record(ent, css_id(&memcg->css)); @@ -4319,7 +4356,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent) if (!mem_cgroup_is_root(memcg)) res_counter_uncharge(&memcg->memsw, PAGE_SIZE); mem_cgroup_swap_statistics(memcg, false); - mem_cgroup_put(memcg); + css_put(&memcg->css); } rcu_read_unlock(); } @@ -4353,11 +4390,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, * This function is only called from task migration context now. * It postpones res_counter and refcount handling till the end * of task migration(mem_cgroup_clear_mc()) for performance - * improvement. But we cannot postpone mem_cgroup_get(to) - * because if the process that has been moved to @to does - * swap-in, the refcount of @to might be decreased to 0. + * improvement. But we cannot postpone css_get(to) because if + * the process that has been moved to @to does swap-in, the + * refcount of @to might be decreased to 0. + * + * We are in attach() phase, so the cgroup is guaranteed to be + * alive, so we can just call css_get(). */ - mem_cgroup_get(to); + css_get(&to->css); return 0; } return -EINVAL; @@ -5136,14 +5176,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val) * starts accounting before all call sites are patched */ memcg_kmem_set_active(memcg); - - /* - * kmem charges can outlive the cgroup. In the case of slab - * pages, for instance, a page contain objects from various - * processes, so it is unfeasible to migrate them away. We - * need to reference count the memcg because of that. - */ - mem_cgroup_get(memcg); } else ret = res_counter_set_limit(&memcg->kmem, val); out: @@ -5176,16 +5208,16 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg) goto out; /* - * destroy(), called if we fail, will issue static_key_slow_inc() and - * mem_cgroup_put() if kmem is enabled. We have to either call them - * unconditionally, or clear the KMEM_ACTIVE flag. I personally find - * this more consistent, since it always leads to the same destroy path + * __mem_cgroup_free() will issue static_key_slow_dec() because this + * memcg is active already. If the later initialization fails then the + * cgroup core triggers the cleanup so we do not have to do it here. */ - mem_cgroup_get(memcg); static_key_slow_inc(&memcg_kmem_enabled_key); mutex_lock(&set_limit_mutex); + memcg_stop_kmem_account(); ret = memcg_update_cache_sizes(memcg); + memcg_resume_kmem_account(); mutex_unlock(&set_limit_mutex); out: return ret; @@ -5864,23 +5896,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return mem_cgroup_sockets_init(memcg, ss); } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) { mem_cgroup_sockets_destroy(memcg); +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) +{ + if (!memcg_kmem_is_active(memcg)) + return; + + /* + * kmem charges can outlive the cgroup. In the case of slab + * pages, for instance, a page contain objects from various + * processes. As we prevent from taking a reference for every + * such allocation we have to be careful when doing uncharge + * (see memcg_uncharge_kmem) and here during offlining. + * + * The idea is that that only the _last_ uncharge which sees + * the dead memcg will drop the last reference. An additional + * reference is taken here before the group is marked dead + * which is then paired with css_put during uncharge resp. here. + * + * Although this might sound strange as this path is called from + * css_offline() when the referencemight have dropped down to 0 + * and shouldn't be incremented anymore (css_tryget would fail) + * we do not have other options because of the kmem allocations + * lifetime. + */ + css_get(&memcg->css); memcg_kmem_mark_dead(memcg); if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) return; - /* - * Charges already down to 0, undo mem_cgroup_get() done in the charge - * path here, being careful not to race with memcg_uncharge_kmem: it is - * possible that the charges went down to 0 between mark_dead and the - * res_counter read, so in that case, we don't need the put - */ if (memcg_kmem_test_and_clear_dead(memcg)) - mem_cgroup_put(memcg); + css_put(&memcg->css); } #else static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) @@ -5888,7 +5940,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) return 0; } -static void kmem_cgroup_destroy(struct mem_cgroup *memcg) +static void memcg_destroy_kmem(struct mem_cgroup *memcg) +{ +} + +static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) { } #endif @@ -6058,13 +6114,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) mz->on_tree = false; mz->memcg = memcg; } - memcg->info.nodeinfo[node] = pn; + memcg->nodeinfo[node] = pn; return 0; } static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) { - kfree(memcg->info.nodeinfo[node]); + kfree(memcg->nodeinfo[node]); } static struct mem_cgroup *mem_cgroup_alloc(void) @@ -6137,49 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) vfree(memcg); } - -/* - * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU, - * but in process context. The work_freeing structure is overlaid - * on the rcu_freeing structure, which itself is overlaid on memsw. - */ -static void free_work(struct work_struct *work) -{ - struct mem_cgroup *memcg; - - memcg = container_of(work, struct mem_cgroup, work_freeing); - __mem_cgroup_free(memcg); -} - -static void free_rcu(struct rcu_head *rcu_head) -{ - struct mem_cgroup *memcg; - - memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing); - INIT_WORK(&memcg->work_freeing, free_work); - schedule_work(&memcg->work_freeing); -} - -static void mem_cgroup_get(struct mem_cgroup *memcg) -{ - atomic_inc(&memcg->refcnt); -} - -static void __mem_cgroup_put(struct mem_cgroup *memcg, int count) -{ - if (atomic_sub_and_test(count, &memcg->refcnt)) { - struct mem_cgroup *parent = parent_mem_cgroup(memcg); - call_rcu(&memcg->rcu_freeing, free_rcu); - if (parent) - mem_cgroup_put(parent); - } -} - -static void mem_cgroup_put(struct mem_cgroup *memcg) -{ - __mem_cgroup_put(memcg, 1); -} - /* * Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled. */ @@ -6239,7 +6252,6 @@ mem_cgroup_css_alloc(struct cgroup *cont) memcg->last_scanned_node = MAX_NUMNODES; INIT_LIST_HEAD(&memcg->oom_notify); - atomic_set(&memcg->refcnt, 1); memcg->move_charge_at_immigrate = 0; mutex_init(&memcg->thresholds_lock); spin_lock_init(&memcg->move_lock); @@ -6275,12 +6287,9 @@ mem_cgroup_css_online(struct cgroup *cont) res_counter_init(&memcg->kmem, &parent->kmem); /* - * We increment refcnt of the parent to ensure that we can - * safely access it on res_counter_charge/uncharge. - * This refcnt will be decremented when freeing this - * mem_cgroup(see mem_cgroup_put). + * No need to take a reference to the parent because cgroup + * core guarantees its existence. */ - mem_cgroup_get(parent); } else { res_counter_init(&memcg->res, NULL); res_counter_init(&memcg->memsw, NULL); @@ -6296,16 +6305,6 @@ mem_cgroup_css_online(struct cgroup *cont) error = memcg_init_kmem(memcg, &mem_cgroup_subsys); mutex_unlock(&memcg_create_mutex); - if (error) { - /* - * We call put now because our (and parent's) refcnts - * are already in place. mem_cgroup_put() will internally - * call __mem_cgroup_free, so return directly - */ - mem_cgroup_put(memcg); - if (parent->use_hierarchy) - mem_cgroup_put(parent); - } return error; } @@ -6317,20 +6316,22 @@ static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) struct mem_cgroup *parent = memcg; while ((parent = parent_mem_cgroup(parent))) - atomic_inc(&parent->dead_count); + mem_cgroup_iter_invalidate(parent); /* * if the root memcg is not hierarchical we have to check it * explicitely. */ if (!root_mem_cgroup->use_hierarchy) - atomic_inc(&root_mem_cgroup->dead_count); + mem_cgroup_iter_invalidate(root_mem_cgroup); } static void mem_cgroup_css_offline(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); + kmem_cgroup_css_offline(memcg); + mem_cgroup_invalidate_reclaim_iterators(memcg); mem_cgroup_reparent_charges(memcg); mem_cgroup_destroy_all_caches(memcg); @@ -6340,9 +6341,8 @@ static void mem_cgroup_css_free(struct cgroup *cont) { struct mem_cgroup *memcg = mem_cgroup_from_cont(cont); - kmem_cgroup_destroy(memcg); - - mem_cgroup_put(memcg); + memcg_destroy_kmem(memcg); + __mem_cgroup_free(memcg); } #ifdef CONFIG_MMU @@ -6651,6 +6651,7 @@ static void __mem_cgroup_clear_mc(void) { struct mem_cgroup *from = mc.from; struct mem_cgroup *to = mc.to; + int i; /* we must uncharge all the leftover precharges from mc.to */ if (mc.precharge) { @@ -6671,7 +6672,9 @@ static void __mem_cgroup_clear_mc(void) if (!mem_cgroup_is_root(mc.from)) res_counter_uncharge(&mc.from->memsw, PAGE_SIZE * mc.moved_swap); - __mem_cgroup_put(mc.from, mc.moved_swap); + + for (i = 0; i < mc.moved_swap; i++) + css_put(&mc.from->css); if (!mem_cgroup_is_root(mc.to)) { /* @@ -6681,7 +6684,7 @@ static void __mem_cgroup_clear_mc(void) res_counter_uncharge(&mc.to->res, PAGE_SIZE * mc.moved_swap); } - /* we've already done mem_cgroup_get(mc.to) */ + /* we've already done css_get(mc.to) */ mc.moved_swap = 0; } memcg_oom_recover(from); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index ceb0c7f..2c13aa7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1410,7 +1410,8 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) /* * Isolate the page, so that it doesn't get reallocated if it - * was free. + * was free. This flag should be kept set until the source page + * is freed and PG_hwpoison on it is set. */ set_migratetype_isolate(p, true); /* @@ -1433,7 +1434,6 @@ static int __get_any_page(struct page *p, unsigned long pfn, int flags) /* Not a free page */ ret = 1; } - unset_migratetype_isolate(p, MIGRATE_MOVABLE); unlock_memory_hotplug(); return ret; } @@ -1494,7 +1494,6 @@ static int soft_offline_huge_page(struct page *page, int flags) atomic_long_add(1 << compound_trans_order(hpage), &num_poisoned_pages); } - /* keep elevated page count for bad page */ return ret; } @@ -1559,7 +1558,7 @@ int soft_offline_page(struct page *page, int flags) atomic_long_inc(&num_poisoned_pages); } } - /* keep elevated page count for bad page */ + unset_migratetype_isolate(page, MIGRATE_MOVABLE); return ret; } @@ -1625,7 +1624,22 @@ static int __soft_offline_page(struct page *page, int flags) if (ret > 0) ret = -EIO; } else { + /* + * After page migration succeeds, the source page can + * be trapped in pagevec and actual freeing is delayed. + * Freeing code works differently based on PG_hwpoison, + * so there's a race. We need to make sure that the + * source page should be freed back to buddy before + * setting PG_hwpoison. + */ + if (!is_free_buddy_page(page)) + lru_add_drain_all(); + if (!is_free_buddy_page(page)) + drain_all_pages(); SetPageHWPoison(page); + if (!is_free_buddy_page(page)) + pr_info("soft offline: %#lx: page leaked\n", + pfn); atomic_long_inc(&num_poisoned_pages); } } else { diff --git a/mm/memory.c b/mm/memory.c index 95d0cce..1ce2e2a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -82,7 +82,6 @@ EXPORT_SYMBOL(max_mapnr); EXPORT_SYMBOL(mem_map); #endif -unsigned long num_physpages; /* * A number of key systems in x86 including ioremap() rely on the assumption * that high_memory defines the upper bound on direct map memory, then end @@ -92,7 +91,6 @@ unsigned long num_physpages; */ void * high_memory; -EXPORT_SYMBOL(num_physpages); EXPORT_SYMBOL(high_memory); /* @@ -1101,6 +1099,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb, spinlock_t *ptl; pte_t *start_pte; pte_t *pte; + unsigned long range_start = addr; again: init_rss_vec(rss); @@ -1151,7 +1150,7 @@ again: if (pte_dirty(ptent)) set_page_dirty(page); if (pte_young(ptent) && - likely(!VM_SequentialReadHint(vma))) + likely(!(vma->vm_flags & VM_SEQ_READ))) mark_page_accessed(page); rss[MM_FILEPAGES]--; } @@ -1206,12 +1205,14 @@ again: force_flush = 0; #ifdef HAVE_GENERIC_MMU_GATHER - tlb->start = addr; - tlb->end = end; + tlb->start = range_start; + tlb->end = addr; #endif tlb_flush_mmu(tlb); - if (addr != end) + if (addr != end) { + range_start = addr; goto again; + } } return addr; @@ -2904,7 +2905,7 @@ static inline void unmap_mapping_range_tree(struct rb_root *root, details->first_index, details->last_index) { vba = vma->vm_pgoff; - vea = vba + ((vma->vm_end - vma->vm_start) >> PAGE_SHIFT) - 1; + vea = vba + vma_pages(vma) - 1; /* Assume for now that PAGE_CACHE_SHIFT == PAGE_SHIFT */ zba = details->first_index; if (zba < vba) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 1ad92b4..ca1dd3a 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -75,7 +75,7 @@ static struct resource *register_memory_resource(u64 start, u64 size) res->end = start + size - 1; res->flags = IORESOURCE_MEM | IORESOURCE_BUSY; if (request_resource(&iomem_resource, res) < 0) { - printk("System RAM resource %pR cannot be added\n", res); + pr_debug("System RAM resource %pR cannot be added\n", res); kfree(res); res = NULL; } @@ -101,12 +101,9 @@ void get_page_bootmem(unsigned long info, struct page *page, atomic_inc(&page->_count); } -/* reference to __meminit __free_pages_bootmem is valid - * so use __ref to tell modpost not to generate a warning */ -void __ref put_page_bootmem(struct page *page) +void put_page_bootmem(struct page *page) { unsigned long type; - static DEFINE_MUTEX(ppb_lock); type = (unsigned long) page->lru.next; BUG_ON(type < MEMORY_HOTPLUG_MIN_BOOTMEM_TYPE || @@ -116,17 +113,8 @@ void __ref put_page_bootmem(struct page *page) ClearPagePrivate(page); set_page_private(page, 0); INIT_LIST_HEAD(&page->lru); - - /* - * Please refer to comment for __free_pages_bootmem() - * for why we serialize here. - */ - mutex_lock(&ppb_lock); - __free_pages_bootmem(page, 0); - mutex_unlock(&ppb_lock); - totalram_pages++; + free_reserved_page(page); } - } #ifdef CONFIG_HAVE_BOOTMEM_INFO_NODE @@ -220,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat) pfn = pgdat->node_start_pfn; end_pfn = pgdat_end_pfn(pgdat); - /* register_section info */ + /* register section info */ for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { /* * Some platforms can assign the same pfn to multiple nodes - on * node0 as well as nodeN. To avoid registering a pfn against * multiple nodes we check that this pfn does not already - * reside in some other node. + * reside in some other nodes. */ if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node)) register_page_bootmem_info_section(pfn); @@ -309,7 +297,7 @@ static int __meminit move_pfn_range_left(struct zone *z1, struct zone *z2, /* can't move pfns which are higher than @z2 */ if (end_pfn > zone_end_pfn(z2)) goto out_fail; - /* the move out part mast at the left most of @z2 */ + /* the move out part must be at the left most of @z2 */ if (start_pfn > z2->zone_start_pfn) goto out_fail; /* must included/overlap */ @@ -775,29 +763,18 @@ EXPORT_SYMBOL_GPL(restore_online_page_callback); void __online_page_set_limits(struct page *page) { - unsigned long pfn = page_to_pfn(page); - - if (pfn >= num_physpages) - num_physpages = pfn + 1; } EXPORT_SYMBOL_GPL(__online_page_set_limits); void __online_page_increment_counters(struct page *page) { - totalram_pages++; - -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages++; -#endif + adjust_managed_page_count(page, 1); } EXPORT_SYMBOL_GPL(__online_page_increment_counters); void __online_page_free(struct page *page) { - ClearPageReserved(page); - init_page_count(page); - __free_page(page); + __free_reserved_page(page); } EXPORT_SYMBOL_GPL(__online_page_free); @@ -918,6 +895,7 @@ static void node_states_set_node(int node, struct memory_notify *arg) int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_type) { + unsigned long flags; unsigned long onlined_pages = 0; struct zone *zone; int need_zonelists_rebuild = 0; @@ -936,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) && !can_online_high_movable(zone)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) { if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } } if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) { if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) { unlock_memory_hotplug(); - return -1; + return -EINVAL; } } @@ -994,9 +972,12 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ return ret; } - zone->managed_pages += onlined_pages; zone->present_pages += onlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages += onlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); + if (onlined_pages) { node_states_set_node(zone_to_nid(zone), &arg); if (need_zonelists_rebuild) @@ -1487,6 +1468,7 @@ static int __ref __offline_pages(unsigned long start_pfn, unsigned long pfn, nr_pages, expire; long offlined_pages; int ret, drain, retry_max, node; + unsigned long flags; struct zone *zone; struct memory_notify arg; @@ -1578,10 +1560,12 @@ repeat: /* reset pagetype flags and makes migrate type to be MOVABLE */ undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE); /* removal success */ - zone->managed_pages -= offlined_pages; + adjust_managed_page_count(pfn_to_page(start_pfn), -offlined_pages); zone->present_pages -= offlined_pages; + + pgdat_resize_lock(zone->zone_pgdat, &flags); zone->zone_pgdat->node_present_pages -= offlined_pages; - totalram_pages -= offlined_pages; + pgdat_resize_unlock(zone->zone_pgdat, &flags); init_per_zone_wmark_min(); @@ -1621,6 +1605,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) { return __offline_pages(start_pfn, start_pfn + nr_pages, 120 * HZ); } +#endif /* CONFIG_MEMORY_HOTREMOVE */ /** * walk_memory_range - walks through all mem sections in [start_pfn, end_pfn) @@ -1634,7 +1619,7 @@ int offline_pages(unsigned long start_pfn, unsigned long nr_pages) * * Returns the return value of func. */ -static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, +int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)) { struct memory_block *mem = NULL; @@ -1671,24 +1656,7 @@ static int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, return 0; } -/** - * offline_memory_block_cb - callback function for offlining memory block - * @mem: the memory block to be offlined - * @arg: buffer to hold error msg - * - * Always return 0, and put the error msg in arg if any. - */ -static int offline_memory_block_cb(struct memory_block *mem, void *arg) -{ - int *ret = arg; - int error = offline_memory_block(mem); - - if (error != 0 && *ret == 0) - *ret = error; - - return 0; -} - +#ifdef CONFIG_MEMORY_HOTREMOVE static int is_memblock_offlined_cb(struct memory_block *mem, void *arg) { int ret = !is_memblock_offlined(mem); @@ -1814,54 +1782,22 @@ void try_offline_node(int nid) } EXPORT_SYMBOL(try_offline_node); -int __ref remove_memory(int nid, u64 start, u64 size) +void __ref remove_memory(int nid, u64 start, u64 size) { - unsigned long start_pfn, end_pfn; - int ret = 0; - int retry = 1; - - start_pfn = PFN_DOWN(start); - end_pfn = PFN_UP(start + size - 1); - - /* - * When CONFIG_MEMCG is on, one memory block may be used by other - * blocks to store page cgroup when onlining pages. But we don't know - * in what order pages are onlined. So we iterate twice to offline - * memory: - * 1st iterate: offline every non primary memory block. - * 2nd iterate: offline primary (i.e. first added) memory block. - */ -repeat: - walk_memory_range(start_pfn, end_pfn, &ret, - offline_memory_block_cb); - if (ret) { - if (!retry) - return ret; - - retry = 0; - ret = 0; - goto repeat; - } + int ret; lock_memory_hotplug(); /* - * we have offlined all memory blocks like this: - * 1. lock memory hotplug - * 2. offline a memory block - * 3. unlock memory hotplug - * - * repeat step1-3 to offline the memory block. All memory blocks - * must be offlined before removing memory. But we don't hold the - * lock in the whole operation. So we should check whether all - * memory blocks are offlined. + * All memory blocks must be offlined before removing memory. Check + * whether all memory blocks in question are offline and trigger a BUG() + * if this is not the case. */ - - ret = walk_memory_range(start_pfn, end_pfn, NULL, + ret = walk_memory_range(PFN_DOWN(start), PFN_UP(start + size - 1), NULL, is_memblock_offlined_cb); if (ret) { unlock_memory_hotplug(); - return ret; + BUG(); } /* remove memmap entry */ @@ -1872,17 +1808,6 @@ repeat: try_offline_node(nid); unlock_memory_hotplug(); - - return 0; } -#else -int offline_pages(unsigned long start_pfn, unsigned long nr_pages) -{ - return -EINVAL; -} -int remove_memory(int nid, u64 start, u64 size) -{ - return -EINVAL; -} -#endif /* CONFIG_MEMORY_HOTREMOVE */ EXPORT_SYMBOL_GPL(remove_memory); +#endif /* CONFIG_MEMORY_HOTREMOVE */ diff --git a/mm/mm_init.c b/mm/mm_init.c index c280a02..633c088 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -9,6 +9,8 @@ #include <linux/init.h> #include <linux/kobject.h> #include <linux/export.h> +#include <linux/memory.h> +#include <linux/notifier.h> #include "internal.h" #ifdef CONFIG_DEBUG_MEMORY_INIT @@ -147,6 +149,51 @@ early_param("mminit_loglevel", set_mminit_loglevel); struct kobject *mm_kobj; EXPORT_SYMBOL_GPL(mm_kobj); +#ifdef CONFIG_SMP +s32 vm_committed_as_batch = 32; + +static void __meminit mm_compute_batch(void) +{ + u64 memsized_batch; + s32 nr = num_present_cpus(); + s32 batch = max_t(s32, nr*2, 32); + + /* batch size set to 0.4% of (total memory/#cpus), or max int32 */ + memsized_batch = min_t(u64, (totalram_pages/nr)/256, 0x7fffffff); + + vm_committed_as_batch = max_t(s32, memsized_batch, batch); +} + +static int __meminit mm_compute_batch_notifier(struct notifier_block *self, + unsigned long action, void *arg) +{ + switch (action) { + case MEM_ONLINE: + case MEM_OFFLINE: + mm_compute_batch(); + default: + break; + } + return NOTIFY_OK; +} + +static struct notifier_block compute_batch_nb __meminitdata = { + .notifier_call = mm_compute_batch_notifier, + .priority = IPC_CALLBACK_PRI, /* use lowest priority */ +}; + +static int __init mm_compute_batch_init(void) +{ + mm_compute_batch(); + register_hotmemory_notifier(&compute_batch_nb); + + return 0; +} + +__initcall(mm_compute_batch_init); + +#endif + static int __init mm_sysfs_init(void) { mm_kobj = kobject_create_and_add("mm", kernel_kobj); @@ -955,7 +955,7 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, if (is_mergeable_vma(vma, file, vm_flags) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; - vm_pglen = (vma->vm_end - vma->vm_start) >> PAGE_SHIFT; + vm_pglen = vma_pages(vma); if (vma->vm_pgoff + vm_pglen == vm_pgoff) return 1; } @@ -1358,18 +1358,19 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, if (!(flags & MAP_ANONYMOUS)) { audit_mmap_fd(fd, flags); - if (unlikely(flags & MAP_HUGETLB)) - return -EINVAL; file = fget(fd); if (!file) goto out; if (is_file_hugepages(file)) len = ALIGN(len, huge_page_size(hstate_file(file))); + retval = -EINVAL; + if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file))) + goto out_fput; } else if (flags & MAP_HUGETLB) { struct user_struct *user = NULL; - struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & - SHM_HUGE_MASK); + struct hstate *hs; + hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK); if (!hs) return -EINVAL; @@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len, flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE); retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff); +out_fput: if (file) fput(file); out: @@ -1876,15 +1878,6 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, } #endif -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the lowest possible address? - */ - if (addr >= TASK_UNMAPPED_BASE && addr < mm->free_area_cache) - mm->free_area_cache = addr; -} - /* * This mmap-allocator allocates new areas top-down from below the * stack's low limit (the base): @@ -1941,19 +1934,6 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, } #endif -void arch_unmap_area_topdown(struct mm_struct *mm, unsigned long addr) -{ - /* - * Is this a new hole at the highest possible address? - */ - if (addr > mm->free_area_cache) - mm->free_area_cache = addr; - - /* dont allow allocations above current base */ - if (mm->free_area_cache > mm->mmap_base) - mm->free_area_cache = mm->mmap_base; -} - unsigned long get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -2374,7 +2354,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, { struct vm_area_struct **insertion_point; struct vm_area_struct *tail_vma = NULL; - unsigned long addr; insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; @@ -2391,11 +2370,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma, } else mm->highest_vm_end = prev ? prev->vm_end : 0; tail_vma->vm_next = NULL; - if (mm->unmap_area == arch_unmap_area) - addr = prev ? prev->vm_end : mm->mmap_base; - else - addr = vma ? vma->vm_start : mm->mmap_base; - mm->unmap_area(mm, addr); mm->mmap_cache = NULL; /* Kill the cache. */ } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 6725ff1..93e6089 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -315,7 +315,7 @@ void mmu_notifier_unregister(struct mmu_notifier *mn, struct mm_struct *mm) /* * Wait for any running method to finish, of course including - * ->release if it was run by mmu_notifier_relase instead of us. + * ->release if it was run by mmu_notifier_release instead of us. */ synchronize_srcu(&srcu); diff --git a/mm/mremap.c b/mm/mremap.c index 463a257..457d34e 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -126,7 +126,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd, continue; pte = ptep_get_and_clear(mm, old_addr, old_pte); pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr); - set_pte_at(mm, new_addr, new_pte, pte); + set_pte_at(mm, new_addr, new_pte, pte_mksoft_dirty(pte)); } arch_leave_lazy_mmu_mode(); @@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, unsigned long charged = 0; bool locked = false; - down_write(¤t->mm->mmap_sem); - if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE)) - goto out; + return ret; + + if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE)) + return ret; if (addr & ~PAGE_MASK) - goto out; + return ret; old_len = PAGE_ALIGN(old_len); new_len = PAGE_ALIGN(new_len); @@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, * a zero new-len is nonsensical. */ if (!new_len) - goto out; + return ret; + + down_write(¤t->mm->mmap_sem); if (flags & MREMAP_FIXED) { - if (flags & MREMAP_MAYMOVE) - ret = mremap_to(addr, old_len, new_addr, new_len, - &locked); + ret = mremap_to(addr, old_len, new_addr, new_len, + &locked); goto out; } diff --git a/mm/nobootmem.c b/mm/nobootmem.c index bdd3fa2..61107cf 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -137,20 +137,25 @@ static unsigned long __init free_low_memory_core_early(void) return count; } -static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) +static int reset_managed_pages_done __initdata; + +static inline void __init reset_node_managed_pages(pg_data_t *pgdat) { struct zone *z; - /* - * In free_area_init_core(), highmem zone's managed_pages is set to - * present_pages, and bootmem allocator doesn't allocate from highmem - * zones. So there's no need to recalculate managed_pages because all - * highmem pages will be managed by the buddy system. Here highmem - * zone also includes highmem movable zone. - */ + if (reset_managed_pages_done) + return; for (z = pgdat->node_zones; z < pgdat->node_zones + MAX_NR_ZONES; z++) - if (!is_highmem(z)) - z->managed_pages = 0; + z->managed_pages = 0; +} + +void __init reset_all_zones_managed_pages(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) + reset_node_managed_pages(pgdat); + reset_managed_pages_done = 1; } /** @@ -160,17 +165,19 @@ static void reset_node_lowmem_managed_pages(pg_data_t *pgdat) */ unsigned long __init free_all_bootmem(void) { - struct pglist_data *pgdat; + unsigned long pages; - for_each_online_pgdat(pgdat) - reset_node_lowmem_managed_pages(pgdat); + reset_all_zones_managed_pages(); /* * We need to use MAX_NUMNODES instead of NODE_DATA(0)->node_id * because in some case like Node0 doesn't have RAM installed * low ram will be on Node1 */ - return free_low_memory_core_early(); + pages = free_low_memory_core_early(); + totalram_pages += pages; + + return pages; } /** @@ -56,7 +56,6 @@ void *high_memory; struct page *mem_map; unsigned long max_mapnr; -unsigned long num_physpages; unsigned long highest_memmap_pfn; struct percpu_counter vm_committed_as; int sysctl_overcommit_memory = OVERCOMMIT_GUESS; /* heuristic overcommit */ @@ -85,7 +84,6 @@ unsigned long vm_memory_committed(void) EXPORT_SYMBOL_GPL(vm_memory_committed); EXPORT_SYMBOL(mem_map); -EXPORT_SYMBOL(num_physpages); /* list of mapped, potentially shareable regions */ static struct kmem_cache *vm_region_jar; @@ -282,6 +280,10 @@ EXPORT_SYMBOL(vmalloc_to_pfn); long vread(char *buf, char *addr, unsigned long count) { + /* Don't allow overflow */ + if ((unsigned long) buf + count < count) + count = -(unsigned long) buf; + memcpy(buf, addr, count); return count; } @@ -1869,10 +1871,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void arch_unmap_area(struct mm_struct *mm, unsigned long addr) -{ -} - void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c3edb62..b100255 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -61,10 +61,14 @@ #include <linux/hugetlb.h> #include <linux/sched/rt.h> +#include <asm/sections.h> #include <asm/tlbflush.h> #include <asm/div64.h> #include "internal.h" +/* prevent >1 _updater_ of zone percpu pageset ->high and ->batch fields */ +static DEFINE_MUTEX(pcp_batch_high_lock); + #ifdef CONFIG_USE_PERCPU_NUMA_NODE_ID DEFINE_PER_CPU(int, numa_node); EXPORT_PER_CPU_SYMBOL(numa_node); @@ -100,6 +104,9 @@ nodemask_t node_states[NR_NODE_STATES] __read_mostly = { }; EXPORT_SYMBOL(node_states); +/* Protect totalram_pages and zone->managed_pages */ +static DEFINE_SPINLOCK(managed_page_count_lock); + unsigned long totalram_pages __read_mostly; unsigned long totalreserve_pages __read_mostly; /* @@ -197,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = { }; int min_free_kbytes = 1024; +int user_min_free_kbytes; static unsigned long __meminitdata nr_kernel_pages; static unsigned long __meminitdata nr_all_pages; @@ -739,14 +747,7 @@ static void __free_pages_ok(struct page *page, unsigned int order) local_irq_restore(flags); } -/* - * Read access to zone->managed_pages is safe because it's unsigned long, - * but we still need to serialize writers. Currently all callers of - * __free_pages_bootmem() except put_page_bootmem() should only be used - * at boot time. So for shorter boot time, we shift the burden to - * put_page_bootmem() to serialize writers. - */ -void __meminit __free_pages_bootmem(struct page *page, unsigned int order) +void __init __free_pages_bootmem(struct page *page, unsigned int order) { unsigned int nr_pages = 1 << order; unsigned int loop; @@ -781,11 +782,7 @@ void __init init_cma_reserved_pageblock(struct page *page) set_page_refcounted(page); set_pageblock_migratetype(page, MIGRATE_CMA); __free_pages(page, pageblock_order); - totalram_pages += pageblock_nr_pages; -#ifdef CONFIG_HIGHMEM - if (PageHighMem(page)) - totalhigh_pages += pageblock_nr_pages; -#endif + adjust_managed_page_count(page, pageblock_nr_pages); } #endif @@ -1050,7 +1047,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype) * MIGRATE_CMA areas. */ if (!is_migrate_cma(migratetype) && - (unlikely(current_order >= pageblock_order / 2) || + (current_order >= pageblock_order / 2 || start_migratetype == MIGRATE_RECLAIMABLE || page_group_by_mobility_disabled)) { int pages; @@ -1179,10 +1176,12 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain; + unsigned long batch; local_irq_save(flags); - if (pcp->count >= pcp->batch) - to_drain = pcp->batch; + batch = ACCESS_ONCE(pcp->batch); + if (pcp->count >= batch) + to_drain = batch; else to_drain = pcp->count; if (to_drain > 0) { @@ -1350,8 +1349,9 @@ void free_hot_cold_page(struct page *page, int cold) list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; if (pcp->count >= pcp->high) { - free_pcppages_bulk(zone, pcp->batch, pcp); - pcp->count -= pcp->batch; + unsigned long batch = ACCESS_ONCE(pcp->batch); + free_pcppages_bulk(zone, batch, pcp); + pcp->count -= batch; } out: @@ -2839,7 +2839,7 @@ EXPORT_SYMBOL(free_pages_exact); * nr_free_zone_pages() counts the number of counts pages which are beyond the * high watermark within all zones at or below a given zone index. For each * zone, the number of pages is calculated as: - * present_pages - high_pages + * managed_pages - high_pages */ static unsigned long nr_free_zone_pages(int offset) { @@ -2906,9 +2906,13 @@ EXPORT_SYMBOL(si_meminfo); #ifdef CONFIG_NUMA void si_meminfo_node(struct sysinfo *val, int nid) { + int zone_type; /* needs to be signed */ + unsigned long managed_pages = 0; pg_data_t *pgdat = NODE_DATA(nid); - val->totalram = pgdat->node_present_pages; + for (zone_type = 0; zone_type < MAX_NR_ZONES; zone_type++) + managed_pages += pgdat->node_zones[zone_type].managed_pages; + val->totalram = managed_pages; val->freeram = node_page_state(nid, NR_FREE_PAGES); #ifdef CONFIG_HIGHMEM val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].managed_pages; @@ -3150,12 +3154,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref) * Add all populated zones of a node to the zonelist. */ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, - int nr_zones, enum zone_type zone_type) + int nr_zones) { struct zone *zone; - - BUG_ON(zone_type >= MAX_NR_ZONES); - zone_type++; + enum zone_type zone_type = MAX_NR_ZONES; do { zone_type--; @@ -3165,8 +3167,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist, &zonelist->_zonerefs[nr_zones++]); check_highest_zone(zone_type); } - } while (zone_type); + return nr_zones; } @@ -3250,18 +3252,25 @@ int numa_zonelist_order_handler(ctl_table *table, int write, static DEFINE_MUTEX(zl_order_mutex); mutex_lock(&zl_order_mutex); - if (write) - strcpy(saved_string, (char*)table->data); + if (write) { + if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) { + ret = -EINVAL; + goto out; + } + strcpy(saved_string, (char *)table->data); + } ret = proc_dostring(table, write, buffer, length, ppos); if (ret) goto out; if (write) { int oldval = user_zonelist_order; - if (__parse_numa_zonelist_order((char*)table->data)) { + + ret = __parse_numa_zonelist_order((char *)table->data); + if (ret) { /* * bogus value. restore saved string */ - strncpy((char*)table->data, saved_string, + strncpy((char *)table->data, saved_string, NUMA_ZONELIST_ORDER_LEN); user_zonelist_order = oldval; } else if (oldval != user_zonelist_order) { @@ -3353,8 +3362,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node) zonelist = &pgdat->node_zonelists[0]; for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++) ; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3368,7 +3376,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat) struct zonelist *zonelist; zonelist = &pgdat->node_zonelists[1]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); zonelist->_zonerefs[j].zone = NULL; zonelist->_zonerefs[j].zone_idx = 0; } @@ -3425,8 +3433,8 @@ static int default_zonelist_order(void) z = &NODE_DATA(nid)->node_zones[zone_type]; if (populated_zone(z)) { if (zone_type < ZONE_NORMAL) - low_kmem_size += z->present_pages; - total_size += z->present_pages; + low_kmem_size += z->managed_pages; + total_size += z->managed_pages; } else if (zone_type == ZONE_NORMAL) { /* * If any node has only lowmem, then node order @@ -3576,7 +3584,7 @@ static void build_zonelists(pg_data_t *pgdat) local_node = pgdat->node_id; zonelist = &pgdat->node_zonelists[0]; - j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1); + j = build_zonelists_node(pgdat, zonelist, 0); /* * Now we build the zonelist so that it contains the zones @@ -3589,14 +3597,12 @@ static void build_zonelists(pg_data_t *pgdat) for (node = local_node + 1; node < MAX_NUMNODES; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } for (node = 0; node < local_node; node++) { if (!node_online(node)) continue; - j = build_zonelists_node(NODE_DATA(node), zonelist, j, - MAX_NR_ZONES - 1); + j = build_zonelists_node(NODE_DATA(node), zonelist, j); } zonelist->_zonerefs[j].zone = NULL; @@ -3705,12 +3711,12 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone) mminit_verify_zonelist(); cpuset_init_current_mems_allowed(); } else { - /* we have to stop all cpus to guarantee there is no user - of zonelist */ #ifdef CONFIG_MEMORY_HOTPLUG if (zone) setup_zone_pageset(zone); #endif + /* we have to stop all cpus to guarantee there is no user + of zonelist */ stop_machine(__build_all_zonelists, pgdat, NULL); /* cpuset refresh routine should be here */ } @@ -4032,7 +4038,40 @@ static int __meminit zone_batchsize(struct zone *zone) #endif } -static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +/* + * pcp->high and pcp->batch values are related and dependent on one another: + * ->batch must never be higher then ->high. + * The following function updates them in a safe manner without read side + * locking. + * + * Any new users of pcp->batch and pcp->high should ensure they can cope with + * those fields changing asynchronously (acording the the above rule). + * + * mutex_is_locked(&pcp_batch_high_lock) required when calling this function + * outside of boot time (or some other assurance that no concurrent updaters + * exist). + */ +static void pageset_update(struct per_cpu_pages *pcp, unsigned long high, + unsigned long batch) +{ + /* start with a fail safe value for batch */ + pcp->batch = 1; + smp_wmb(); + + /* Update high, then batch, in order */ + pcp->high = high; + smp_wmb(); + + pcp->batch = batch; +} + +/* a companion to pageset_set_high() */ +static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch)); +} + +static void pageset_init(struct per_cpu_pageset *p) { struct per_cpu_pages *pcp; int migratetype; @@ -4041,45 +4080,55 @@ static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) pcp = &p->pcp; pcp->count = 0; - pcp->high = 6 * batch; - pcp->batch = max(1UL, 1 * batch); for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++) INIT_LIST_HEAD(&pcp->lists[migratetype]); } +static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch) +{ + pageset_init(p); + pageset_set_batch(p, batch); +} + /* - * setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist + * pageset_set_high() sets the high water mark for hot per_cpu_pagelist * to the value high for the pageset p. */ - -static void setup_pagelist_highmark(struct per_cpu_pageset *p, +static void pageset_set_high(struct per_cpu_pageset *p, unsigned long high) { - struct per_cpu_pages *pcp; + unsigned long batch = max(1UL, high / 4); + if ((high / 4) > (PAGE_SHIFT * 8)) + batch = PAGE_SHIFT * 8; - pcp = &p->pcp; - pcp->high = high; - pcp->batch = max(1UL, high/4); - if ((high/4) > (PAGE_SHIFT * 8)) - pcp->batch = PAGE_SHIFT * 8; + pageset_update(&p->pcp, high, batch); } -static void __meminit setup_zone_pageset(struct zone *zone) +static void __meminit pageset_set_high_and_batch(struct zone *zone, + struct per_cpu_pageset *pcp) { - int cpu; - - zone->pageset = alloc_percpu(struct per_cpu_pageset); + if (percpu_pagelist_fraction) + pageset_set_high(pcp, + (zone->managed_pages / + percpu_pagelist_fraction)); + else + pageset_set_batch(pcp, zone_batchsize(zone)); +} - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); +static void __meminit zone_pageset_init(struct zone *zone, int cpu) +{ + struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu); - setup_pageset(pcp, zone_batchsize(zone)); + pageset_init(pcp); + pageset_set_high_and_batch(zone, pcp); +} - if (percpu_pagelist_fraction) - setup_pagelist_highmark(pcp, - (zone->managed_pages / - percpu_pagelist_fraction)); - } +static void __meminit setup_zone_pageset(struct zone *zone) +{ + int cpu; + zone->pageset = alloc_percpu(struct per_cpu_pageset); + for_each_possible_cpu(cpu) + zone_pageset_init(zone, cpu); } /* @@ -4368,13 +4417,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid, */ static unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - /* Get the start and end of the node and zone */ - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); + /* Get the start and end of the zone */ zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type]; zone_end_pfn = arch_zone_highest_possible_pfn[zone_type]; adjust_zone_range_for_zone_movable(nid, zone_type, @@ -4429,14 +4478,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn, /* Return the number of page frames in holes in a zone on a node */ static unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *ignored) { unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type]; unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type]; - unsigned long node_start_pfn, node_end_pfn; unsigned long zone_start_pfn, zone_end_pfn; - get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn); zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high); zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high); @@ -4449,6 +4498,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid, #else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zones_size) { return zones_size[zone_type]; @@ -4456,6 +4507,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid, static inline unsigned long __meminit zone_absent_pages_in_node(int nid, unsigned long zone_type, + unsigned long node_start_pfn, + unsigned long node_end_pfn, unsigned long *zholes_size) { if (!zholes_size) @@ -4467,21 +4520,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid, #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ static void __meminit calculate_node_totalpages(struct pglist_data *pgdat, - unsigned long *zones_size, unsigned long *zholes_size) + unsigned long node_start_pfn, + unsigned long node_end_pfn, + unsigned long *zones_size, + unsigned long *zholes_size) { unsigned long realtotalpages, totalpages = 0; enum zone_type i; for (i = 0; i < MAX_NR_ZONES; i++) totalpages += zone_spanned_pages_in_node(pgdat->node_id, i, - zones_size); + node_start_pfn, + node_end_pfn, + zones_size); pgdat->node_spanned_pages = totalpages; realtotalpages = totalpages; for (i = 0; i < MAX_NR_ZONES; i++) realtotalpages -= zone_absent_pages_in_node(pgdat->node_id, i, - zholes_size); + node_start_pfn, node_end_pfn, + zholes_size); pgdat->node_present_pages = realtotalpages; printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages); @@ -4590,6 +4649,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, * NOTE: pgdat should get zeroed by caller. */ static void __paginginit free_area_init_core(struct pglist_data *pgdat, + unsigned long node_start_pfn, unsigned long node_end_pfn, unsigned long *zones_size, unsigned long *zholes_size) { enum zone_type j; @@ -4611,8 +4671,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat, struct zone *zone = pgdat->node_zones + j; unsigned long size, realsize, freesize, memmap_pages; - size = zone_spanned_pages_in_node(nid, j, zones_size); + size = zone_spanned_pages_in_node(nid, j, node_start_pfn, + node_end_pfn, zones_size); realsize = freesize = size - zone_absent_pages_in_node(nid, j, + node_start_pfn, + node_end_pfn, zholes_size); /* @@ -4726,6 +4789,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); + unsigned long start_pfn = 0; + unsigned long end_pfn = 0; /* pg_data_t should be reset to zero when it's allocated */ WARN_ON(pgdat->nr_zones || pgdat->classzone_idx); @@ -4733,7 +4798,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, pgdat->node_id = nid; pgdat->node_start_pfn = node_start_pfn; init_zone_allows_reclaim(nid); - calculate_node_totalpages(pgdat, zones_size, zholes_size); +#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP + get_pfn_range_for_nid(nid, &start_pfn, &end_pfn); +#endif + calculate_node_totalpages(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); alloc_node_mem_map(pgdat); #ifdef CONFIG_FLAT_NODE_MEM_MAP @@ -4742,7 +4811,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, (unsigned long)pgdat->node_mem_map); #endif - free_area_init_core(pgdat, zones_size, zholes_size); + free_area_init_core(pgdat, start_pfn, end_pfn, + zones_size, zholes_size); } #ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP @@ -5150,35 +5220,101 @@ early_param("movablecore", cmdline_parse_movablecore); #endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */ -unsigned long free_reserved_area(unsigned long start, unsigned long end, - int poison, char *s) +void adjust_managed_page_count(struct page *page, long count) { - unsigned long pages, pos; + spin_lock(&managed_page_count_lock); + page_zone(page)->managed_pages += count; + totalram_pages += count; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages += count; +#endif + spin_unlock(&managed_page_count_lock); +} +EXPORT_SYMBOL(adjust_managed_page_count); - pos = start = PAGE_ALIGN(start); - end &= PAGE_MASK; - for (pages = 0; pos < end; pos += PAGE_SIZE, pages++) { - if (poison) - memset((void *)pos, poison, PAGE_SIZE); - free_reserved_page(virt_to_page((void *)pos)); +unsigned long free_reserved_area(void *start, void *end, int poison, char *s) +{ + void *pos; + unsigned long pages = 0; + + start = (void *)PAGE_ALIGN((unsigned long)start); + end = (void *)((unsigned long)end & PAGE_MASK); + for (pos = start; pos < end; pos += PAGE_SIZE, pages++) { + if ((unsigned int)poison <= 0xFF) + memset(pos, poison, PAGE_SIZE); + free_reserved_page(virt_to_page(pos)); } if (pages && s) - pr_info("Freeing %s memory: %ldK (%lx - %lx)\n", + pr_info("Freeing %s memory: %ldK (%p - %p)\n", s, pages << (PAGE_SHIFT - 10), start, end); return pages; } +EXPORT_SYMBOL(free_reserved_area); #ifdef CONFIG_HIGHMEM void free_highmem_page(struct page *page) { __free_reserved_page(page); totalram_pages++; + page_zone(page)->managed_pages++; totalhigh_pages++; } #endif + +void __init mem_init_print_info(const char *str) +{ + unsigned long physpages, codesize, datasize, rosize, bss_size; + unsigned long init_code_size, init_data_size; + + physpages = get_num_physpages(); + codesize = _etext - _stext; + datasize = _edata - _sdata; + rosize = __end_rodata - __start_rodata; + bss_size = __bss_stop - __bss_start; + init_data_size = __init_end - __init_begin; + init_code_size = _einittext - _sinittext; + + /* + * Detect special cases and adjust section sizes accordingly: + * 1) .init.* may be embedded into .data sections + * 2) .init.text.* may be out of [__init_begin, __init_end], + * please refer to arch/tile/kernel/vmlinux.lds.S. + * 3) .rodata.* may be embedded into .text or .data sections. + */ +#define adj_init_size(start, end, size, pos, adj) \ + if (start <= pos && pos < end && size > adj) \ + size -= adj; + + adj_init_size(__init_begin, __init_end, init_data_size, + _sinittext, init_code_size); + adj_init_size(_stext, _etext, codesize, _sinittext, init_code_size); + adj_init_size(_sdata, _edata, datasize, __init_begin, init_data_size); + adj_init_size(_stext, _etext, codesize, __start_rodata, rosize); + adj_init_size(_sdata, _edata, datasize, __start_rodata, rosize); + +#undef adj_init_size + + printk("Memory: %luK/%luK available " + "(%luK kernel code, %luK rwdata, %luK rodata, " + "%luK init, %luK bss, %luK reserved" +#ifdef CONFIG_HIGHMEM + ", %luK highmem" +#endif + "%s%s)\n", + nr_free_pages() << (PAGE_SHIFT-10), physpages << (PAGE_SHIFT-10), + codesize >> 10, datasize >> 10, rosize >> 10, + (init_data_size + init_code_size) >> 10, bss_size >> 10, + (physpages - totalram_pages) << (PAGE_SHIFT-10), +#ifdef CONFIG_HIGHMEM + totalhigh_pages << (PAGE_SHIFT-10), +#endif + str ? ", " : "", str ? str : ""); +} + /** * set_dma_reserve - set the specified number of pages reserved in the first zone * @new_dma_reserve: The number of pages to mark reserved @@ -5454,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void) int __meminit init_per_zone_wmark_min(void) { unsigned long lowmem_kbytes; + int new_min_free_kbytes; lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10); - - min_free_kbytes = int_sqrt(lowmem_kbytes * 16); - if (min_free_kbytes < 128) - min_free_kbytes = 128; - if (min_free_kbytes > 65536) - min_free_kbytes = 65536; + new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16); + + if (new_min_free_kbytes > user_min_free_kbytes) { + min_free_kbytes = new_min_free_kbytes; + if (min_free_kbytes < 128) + min_free_kbytes = 128; + if (min_free_kbytes > 65536) + min_free_kbytes = 65536; + } else { + pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n", + new_min_free_kbytes, user_min_free_kbytes); + } setup_per_zone_wmarks(); refresh_zone_stat_thresholds(); setup_per_zone_lowmem_reserve(); @@ -5479,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { proc_dointvec(table, write, buffer, length, ppos); - if (write) + if (write) { + user_min_free_kbytes = min_free_kbytes; setup_per_zone_wmarks(); + } return 0; } @@ -5540,7 +5685,6 @@ int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write, * cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist * can have before it gets flushed back to buddy allocator. */ - int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, void __user *buffer, size_t *length, loff_t *ppos) { @@ -5551,14 +5695,16 @@ int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write, ret = proc_dointvec_minmax(table, write, buffer, length, ppos); if (!write || (ret < 0)) return ret; + + mutex_lock(&pcp_batch_high_lock); for_each_populated_zone(zone) { - for_each_possible_cpu(cpu) { - unsigned long high; - high = zone->managed_pages / percpu_pagelist_fraction; - setup_pagelist_highmark( - per_cpu_ptr(zone->pageset, cpu), high); - } + unsigned long high; + high = zone->managed_pages / percpu_pagelist_fraction; + for_each_possible_cpu(cpu) + pageset_set_high(per_cpu_ptr(zone->pageset, cpu), + high); } + mutex_unlock(&pcp_batch_high_lock); return 0; } @@ -6047,32 +6193,18 @@ void free_contig_range(unsigned long pfn, unsigned nr_pages) #endif #ifdef CONFIG_MEMORY_HOTPLUG -static int __meminit __zone_pcp_update(void *data) -{ - struct zone *zone = data; - int cpu; - unsigned long batch = zone_batchsize(zone), flags; - - for_each_possible_cpu(cpu) { - struct per_cpu_pageset *pset; - struct per_cpu_pages *pcp; - - pset = per_cpu_ptr(zone->pageset, cpu); - pcp = &pset->pcp; - - local_irq_save(flags); - if (pcp->count > 0) - free_pcppages_bulk(zone, pcp->count, pcp); - drain_zonestat(zone, pset); - setup_pageset(pset, batch); - local_irq_restore(flags); - } - return 0; -} - +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalulated. + */ void __meminit zone_pcp_update(struct zone *zone) { - stop_machine(__zone_pcp_update, zone, NULL); + unsigned cpu; + mutex_lock(&pcp_batch_high_lock); + for_each_possible_cpu(cpu) + pageset_set_high_and_batch(zone, + per_cpu_ptr(zone->pageset, cpu)); + mutex_unlock(&pcp_batch_high_lock); } #endif @@ -6142,6 +6274,10 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn) list_del(&page->lru); rmv_page_order(page); zone->free_area[order].nr_free--; +#ifdef CONFIG_HIGHMEM + if (PageHighMem(page)) + totalhigh_pages -= 1 << order; +#endif for (i = 0; i < (1 << order); i++) SetPageReserved((page+i)); pfn += (1 << order); diff --git a/mm/page_io.c b/mm/page_io.c index a8a3ef4..ba05b64 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -21,6 +21,7 @@ #include <linux/writeback.h> #include <linux/frontswap.h> #include <linux/aio.h> +#include <linux/blkdev.h> #include <asm/pgtable.h> static struct bio *get_swap_bio(gfp_t gfp_flags, @@ -80,9 +81,54 @@ void end_swap_bio_read(struct bio *bio, int err) imajor(bio->bi_bdev->bd_inode), iminor(bio->bi_bdev->bd_inode), (unsigned long long)bio->bi_sector); - } else { - SetPageUptodate(page); + goto out; } + + SetPageUptodate(page); + + /* + * There is no guarantee that the page is in swap cache - the software + * suspend code (at least) uses end_swap_bio_read() against a non- + * swapcache page. So we must check PG_swapcache before proceeding with + * this optimization. + */ + if (likely(PageSwapCache(page))) { + struct swap_info_struct *sis; + + sis = page_swap_info(page); + if (sis->flags & SWP_BLKDEV) { + /* + * The swap subsystem performs lazy swap slot freeing, + * expecting that the page will be swapped out again. + * So we can avoid an unnecessary write if the page + * isn't redirtied. + * This is good for real swap storage because we can + * reduce unnecessary I/O and enhance wear-leveling + * if an SSD is used as the as swap device. + * But if in-memory swap device (eg zram) is used, + * this causes a duplicated copy between uncompressed + * data in VM-owned memory and compressed data in + * zram-owned memory. So let's free zram-owned memory + * and make the VM-owned decompressed page *dirty*, + * so the page should be swapped out somewhere again if + * we again wish to reclaim it. + */ + struct gendisk *disk = sis->bdev->bd_disk; + if (disk->fops->swap_slot_free_notify) { + swp_entry_t entry; + unsigned long offset; + + entry.val = page_private(page); + offset = swp_offset(entry); + + SetPageDirty(page); + disk->fops->swap_slot_free_notify(sis->bdev, + offset); + } + } + } + +out: unlock_page(page); bio_put(bio); } diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 0c8323f..e1a6e4f 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -124,7 +124,8 @@ void pmdp_splitting_flush(struct vm_area_struct *vma, unsigned long address, #ifndef __HAVE_ARCH_PGTABLE_DEPOSIT #ifdef CONFIG_TRANSPARENT_HUGEPAGE -void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) +void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, + pgtable_t pgtable) { assert_spin_locked(&mm->page_table_lock); @@ -141,7 +142,7 @@ void pgtable_trans_huge_deposit(struct mm_struct *mm, pgtable_t pgtable) #ifndef __HAVE_ARCH_PGTABLE_WITHDRAW #ifdef CONFIG_TRANSPARENT_HUGEPAGE /* no "address" argument so destroys page coloring of some arch */ -pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm) +pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) { pgtable_t pgtable; @@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma, * mapping is already gone, the unmap path will have * set PG_referenced or activated the page. */ - if (likely(!VM_SequentialReadHint(vma))) + if (likely(!(vma->vm_flags & VM_SEQ_READ))) referenced++; } pte_unmap_unlock(pte, ptl); @@ -1093,9 +1093,10 @@ void page_add_new_anon_rmap(struct page *page, else __inc_zone_page_state(page, NR_ANON_TRANSPARENT_HUGEPAGES); __page_set_anon_rmap(page, vma, address, 1); - if (!mlocked_vma_newpage(vma, page)) - lru_cache_add_lru(page, LRU_ACTIVE_ANON); - else + if (!mlocked_vma_newpage(vma, page)) { + SetPageActive(page); + lru_cache_add(page); + } else add_page_to_unevictable_list(page); } @@ -1936,6 +1936,13 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) inode = shmem_get_inode(dir->i_sb, dir, mode, dev, VM_NORESERVE); if (inode) { +#ifdef CONFIG_TMPFS_POSIX_ACL + error = generic_acl_init(inode, dir); + if (error) { + iput(inode); + return error; + } +#endif error = security_inode_init_security(inode, dir, &dentry->d_name, shmem_initxattrs, NULL); @@ -1945,15 +1952,8 @@ shmem_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t dev) return error; } } -#ifdef CONFIG_TMPFS_POSIX_ACL - error = generic_acl_init(inode, dir); - if (error) { - iput(inode); - return error; - } -#else + error = 0; -#endif dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = CURRENT_TIME; d_instantiate(dentry, inode); @@ -565,7 +565,7 @@ static void init_node_lock_keys(int q) if (slab_state < UP) return; - for (i = 1; i < PAGE_SHIFT + MAX_ORDER; i++) { + for (i = 1; i <= KMALLOC_SHIFT_HIGH; i++) { struct kmem_cache_node *n; struct kmem_cache *cache = kmalloc_caches[i]; @@ -1180,6 +1180,12 @@ static int init_cache_node_node(int node) return 0; } +static inline int slabs_tofree(struct kmem_cache *cachep, + struct kmem_cache_node *n) +{ + return (n->free_objects + cachep->num - 1) / cachep->num; +} + static void __cpuinit cpuup_canceled(long cpu) { struct kmem_cache *cachep; @@ -1241,7 +1247,7 @@ free_array_cache: n = cachep->node[node]; if (!n) continue; - drain_freelist(cachep, n, n->free_objects); + drain_freelist(cachep, n, slabs_tofree(cachep, n)); } } @@ -1408,7 +1414,7 @@ static int __meminit drain_cache_node_node(int node) if (!n) continue; - drain_freelist(cachep, n, n->free_objects); + drain_freelist(cachep, n, slabs_tofree(cachep, n)); if (!list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial)) { @@ -2532,7 +2538,7 @@ static int __cache_shrink(struct kmem_cache *cachep) if (!n) continue; - drain_freelist(cachep, n, n->free_objects); + drain_freelist(cachep, n, slabs_tofree(cachep, n)); ret += !list_empty(&n->slabs_full) || !list_empty(&n->slabs_partial); @@ -3338,18 +3344,6 @@ done: return obj; } -/** - * kmem_cache_alloc_node - Allocate an object on the specified node - * @cachep: The cache to allocate from. - * @flags: See kmalloc(). - * @nodeid: node number of the target node. - * @caller: return address of caller, used for debug information - * - * Identical to kmem_cache_alloc but it will allocate memory on the given - * node, which can improve the performance for cpu bound structures. - * - * Fallback to other node is possible if __GFP_THISNODE is not set. - */ static __always_inline void * slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid, unsigned long caller) @@ -3643,6 +3637,17 @@ EXPORT_SYMBOL(kmem_cache_alloc_trace); #endif #ifdef CONFIG_NUMA +/** + * kmem_cache_alloc_node - Allocate an object on the specified node + * @cachep: The cache to allocate from. + * @flags: See kmalloc(). + * @nodeid: node number of the target node. + * + * Identical to kmem_cache_alloc but it will allocate memory on the given + * node, which can improve the performance for cpu bound structures. + * + * Fallback to other node is possible if __GFP_THISNODE is not set. + */ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid) { void *ret = slab_alloc_node(cachep, flags, nodeid, _RET_IP_); @@ -4431,20 +4436,10 @@ static int leaks_show(struct seq_file *m, void *p) return 0; } -static void *s_next(struct seq_file *m, void *p, loff_t *pos) -{ - return seq_list_next(p, &slab_caches, pos); -} - -static void s_stop(struct seq_file *m, void *p) -{ - mutex_unlock(&slab_mutex); -} - static const struct seq_operations slabstats_op = { .start = leaks_start, - .next = s_next, - .stop = s_stop, + .next = slab_next, + .stop = slab_stop, .show = leaks_show, }; @@ -271,3 +271,6 @@ struct kmem_cache_node { #endif }; + +void *slab_next(struct seq_file *m, void *p, loff_t *pos); +void slab_stop(struct seq_file *m, void *p); diff --git a/mm/slab_common.c b/mm/slab_common.c index 2d41450..538bade 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -497,6 +497,13 @@ void __init create_kmalloc_caches(unsigned long flags) #ifdef CONFIG_SLABINFO + +#ifdef CONFIG_SLAB +#define SLABINFO_RIGHTS (S_IWUSR | S_IRUSR) +#else +#define SLABINFO_RIGHTS S_IRUSR +#endif + void print_slabinfo_header(struct seq_file *m) { /* @@ -531,12 +538,12 @@ static void *s_start(struct seq_file *m, loff_t *pos) return seq_list_start(&slab_caches, *pos); } -static void *s_next(struct seq_file *m, void *p, loff_t *pos) +void *slab_next(struct seq_file *m, void *p, loff_t *pos) { return seq_list_next(p, &slab_caches, pos); } -static void s_stop(struct seq_file *m, void *p) +void slab_stop(struct seq_file *m, void *p) { mutex_unlock(&slab_mutex); } @@ -613,8 +620,8 @@ static int s_show(struct seq_file *m, void *p) */ static const struct seq_operations slabinfo_op = { .start = s_start, - .next = s_next, - .stop = s_stop, + .next = slab_next, + .stop = slab_stop, .show = s_show, }; @@ -633,7 +640,8 @@ static const struct file_operations proc_slabinfo_operations = { static int __init slab_proc_init(void) { - proc_create("slabinfo", S_IRUSR, NULL, &proc_slabinfo_operations); + proc_create("slabinfo", SLABINFO_RIGHTS, NULL, + &proc_slabinfo_operations); return 0; } module_init(slab_proc_init); @@ -122,7 +122,7 @@ static inline void clear_slob_page_free(struct page *sp) } #define SLOB_UNIT sizeof(slob_t) -#define SLOB_UNITS(size) (((size) + SLOB_UNIT - 1)/SLOB_UNIT) +#define SLOB_UNITS(size) DIV_ROUND_UP(size, SLOB_UNIT) /* * struct slob_rcu is inserted at the tail of allocated slob blocks, which @@ -554,7 +554,7 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags, node); } - if (c->ctor) + if (b && c->ctor) c->ctor(b); kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); @@ -123,6 +123,15 @@ static inline int kmem_cache_debug(struct kmem_cache *s) #endif } +static inline bool kmem_cache_has_cpu_partial(struct kmem_cache *s) +{ +#ifdef CONFIG_SLUB_CPU_PARTIAL + return !kmem_cache_debug(s); +#else + return false; +#endif +} + /* * Issues still to be resolved: * @@ -1573,7 +1582,8 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, put_cpu_partial(s, page, 0); stat(s, CPU_PARTIAL_NODE); } - if (kmem_cache_debug(s) || available > s->cpu_partial / 2) + if (!kmem_cache_has_cpu_partial(s) + || available > s->cpu_partial / 2) break; } @@ -1884,6 +1894,7 @@ redo: static void unfreeze_partials(struct kmem_cache *s, struct kmem_cache_cpu *c) { +#ifdef CONFIG_SLUB_CPU_PARTIAL struct kmem_cache_node *n = NULL, *n2 = NULL; struct page *page, *discard_page = NULL; @@ -1938,6 +1949,7 @@ static void unfreeze_partials(struct kmem_cache *s, discard_slab(s, page); stat(s, FREE_SLAB); } +#endif } /* @@ -1951,10 +1963,14 @@ static void unfreeze_partials(struct kmem_cache *s, */ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) { +#ifdef CONFIG_SLUB_CPU_PARTIAL struct page *oldpage; int pages; int pobjects; + if (!s->cpu_partial) + return; + do { pages = 0; pobjects = 0; @@ -1987,6 +2003,7 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) page->next = oldpage; } while (this_cpu_cmpxchg(s->cpu_slab->partial, oldpage, page) != oldpage); +#endif } static inline void flush_slab(struct kmem_cache *s, struct kmem_cache_cpu *c) @@ -2358,7 +2375,7 @@ redo: object = c->freelist; page = c->page; - if (unlikely(!object || !node_match(page, node))) + if (unlikely(!object || !page || !node_match(page, node))) object = __slab_alloc(s, gfpflags, node, addr, c); else { @@ -2495,7 +2512,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, new.inuse--; if ((!new.inuse || !prior) && !was_frozen) { - if (!kmem_cache_debug(s) && !prior) + if (kmem_cache_has_cpu_partial(s) && !prior) /* * Slab was on no list before and will be partially empty @@ -2550,8 +2567,9 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Objects left in the slab. If it was not on the partial list before * then add it. */ - if (kmem_cache_debug(s) && unlikely(!prior)) { - remove_full(s, page); + if (!kmem_cache_has_cpu_partial(s) && unlikely(!prior)) { + if (kmem_cache_debug(s)) + remove_full(s, page); add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } @@ -3059,7 +3077,7 @@ static int kmem_cache_open(struct kmem_cache *s, unsigned long flags) * per node list when we run out of per cpu objects. We only fetch 50% * to keep some capacity around for frees. */ - if (kmem_cache_debug(s)) + if (!kmem_cache_has_cpu_partial(s)) s->cpu_partial = 0; else if (s->size >= PAGE_SIZE) s->cpu_partial = 2; @@ -4456,7 +4474,7 @@ static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf, err = strict_strtoul(buf, 10, &objects); if (err) return err; - if (objects && kmem_cache_debug(s)) + if (objects && !kmem_cache_has_cpu_partial(s)) return -EINVAL; s->cpu_partial = objects; @@ -5269,7 +5287,6 @@ __initcall(slab_sysfs_init); #ifdef CONFIG_SLABINFO void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) { - unsigned long nr_partials = 0; unsigned long nr_slabs = 0; unsigned long nr_objs = 0; unsigned long nr_free = 0; @@ -5281,9 +5298,8 @@ void get_slabinfo(struct kmem_cache *s, struct slabinfo *sinfo) if (!n) continue; - nr_partials += n->nr_partial; - nr_slabs += atomic_long_read(&n->nr_slabs); - nr_objs += atomic_long_read(&n->total_objects); + nr_slabs += node_nr_slabs(n); + nr_objs += node_nr_objs(n); nr_free += count_partial(n, count_free); } diff --git a/mm/sparse.c b/mm/sparse.c index 1c91f0d3..308d5033 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -79,7 +79,6 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) { unsigned long root = SECTION_NR_TO_ROOT(section_nr); struct mem_section *section; - int ret = 0; if (mem_section[root]) return -EEXIST; @@ -90,7 +89,7 @@ static int __meminit sparse_index_init(unsigned long section_nr, int nid) mem_section[root] = section; - return ret; + return 0; } #else /* !SPARSEMEM_EXTREME */ static inline int sparse_index_init(unsigned long section_nr, int nid) @@ -481,6 +480,9 @@ void __init sparse_init(void) struct page **map_map; #endif + /* see include/linux/mmzone.h 'struct mem_section' definition */ + BUILD_BUG_ON(!is_power_of_2(sizeof(struct mem_section))); + /* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */ set_pageblock_order(); @@ -751,6 +753,7 @@ out: return ret; } +#ifdef CONFIG_MEMORY_HOTREMOVE #ifdef CONFIG_MEMORY_FAILURE static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) { @@ -772,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages) } #endif -#ifdef CONFIG_MEMORY_HOTREMOVE static void free_section_usemap(struct page *memmap, unsigned long *usemap) { struct page *usemap_page; @@ -34,10 +34,13 @@ #include "internal.h" +#define CREATE_TRACE_POINTS +#include <trace/events/pagemap.h> + /* How many pages do we try to swap or page in/out together? */ int page_cluster; -static DEFINE_PER_CPU(struct pagevec[NR_LRU_LISTS], lru_add_pvecs); +static DEFINE_PER_CPU(struct pagevec, lru_add_pvec); static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs); static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs); @@ -384,6 +387,7 @@ static void __activate_page(struct page *page, struct lruvec *lruvec, SetPageActive(page); lru += LRU_ACTIVE; add_page_to_lru_list(page, lruvec, lru); + trace_mm_lru_activate(page, page_to_pfn(page)); __count_vm_event(PGACTIVATE); update_page_reclaim_stat(lruvec, file, 1); @@ -428,6 +432,33 @@ void activate_page(struct page *page) } #endif +static void __lru_cache_activate_page(struct page *page) +{ + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); + int i; + + /* + * Search backwards on the optimistic assumption that the page being + * activated has just been added to this pagevec. Note that only + * the local pagevec is examined as a !PageLRU page could be in the + * process of being released, reclaimed, migrated or on a remote + * pagevec that is currently being drained. Furthermore, marking + * a remote pagevec's page PageActive potentially hits a race where + * a page is marked PageActive just after it is added to the inactive + * list causing accounting errors and BUG_ON checks to trigger. + */ + for (i = pagevec_count(pvec) - 1; i >= 0; i--) { + struct page *pagevec_page = pvec->pages[i]; + + if (pagevec_page == page) { + SetPageActive(page); + break; + } + } + + put_cpu_var(lru_add_pvec); +} + /* * Mark a page as having seen activity. * @@ -438,8 +469,18 @@ void activate_page(struct page *page) void mark_page_accessed(struct page *page) { if (!PageActive(page) && !PageUnevictable(page) && - PageReferenced(page) && PageLRU(page)) { - activate_page(page); + PageReferenced(page)) { + + /* + * If the page is on the LRU, queue it for activation via + * activate_page_pvecs. Otherwise, assume the page is on a + * pagevec, mark it active and it'll be moved to the active + * LRU on the next drain. + */ + if (PageLRU(page)) + activate_page(page); + else + __lru_cache_activate_page(page); ClearPageReferenced(page); } else if (!PageReferenced(page)) { SetPageReferenced(page); @@ -448,42 +489,37 @@ void mark_page_accessed(struct page *page) EXPORT_SYMBOL(mark_page_accessed); /* - * Order of operations is important: flush the pagevec when it's already - * full, not when adding the last page, to make sure that last page is - * not added to the LRU directly when passed to this function. Because - * mark_page_accessed() (called after this when writing) only activates - * pages that are on the LRU, linear writes in subpage chunks would see - * every PAGEVEC_SIZE page activated, which is unexpected. + * Queue the page for addition to the LRU via pagevec. The decision on whether + * to add the page to the [in]active [file|anon] list is deferred until the + * pagevec is drained. This gives a chance for the caller of __lru_cache_add() + * have the page added to the active list using mark_page_accessed(). */ -void __lru_cache_add(struct page *page, enum lru_list lru) +void __lru_cache_add(struct page *page) { - struct pagevec *pvec = &get_cpu_var(lru_add_pvecs)[lru]; + struct pagevec *pvec = &get_cpu_var(lru_add_pvec); page_cache_get(page); if (!pagevec_space(pvec)) - __pagevec_lru_add(pvec, lru); + __pagevec_lru_add(pvec); pagevec_add(pvec, page); - put_cpu_var(lru_add_pvecs); + put_cpu_var(lru_add_pvec); } EXPORT_SYMBOL(__lru_cache_add); /** - * lru_cache_add_lru - add a page to a page list + * lru_cache_add - add a page to a page list * @page: the page to be added to the LRU. - * @lru: the LRU list to which the page is added. */ -void lru_cache_add_lru(struct page *page, enum lru_list lru) +void lru_cache_add(struct page *page) { if (PageActive(page)) { VM_BUG_ON(PageUnevictable(page)); - ClearPageActive(page); } else if (PageUnevictable(page)) { VM_BUG_ON(PageActive(page)); - ClearPageUnevictable(page); } - VM_BUG_ON(PageLRU(page) || PageActive(page) || PageUnevictable(page)); - __lru_cache_add(page, lru); + VM_BUG_ON(PageLRU(page)); + __lru_cache_add(page); } /** @@ -583,15 +619,10 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec, */ void lru_add_drain_cpu(int cpu) { - struct pagevec *pvecs = per_cpu(lru_add_pvecs, cpu); - struct pagevec *pvec; - int lru; + struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu); - for_each_lru(lru) { - pvec = &pvecs[lru - LRU_BASE]; - if (pagevec_count(pvec)) - __pagevec_lru_add(pvec, lru); - } + if (pagevec_count(pvec)) + __pagevec_lru_add(pvec); pvec = &per_cpu(lru_rotate_pvecs, cpu); if (pagevec_count(pvec)) { @@ -708,6 +739,9 @@ void release_pages(struct page **pages, int nr, int cold) del_page_from_lru_list(page, lruvec, page_off_lru(page)); } + /* Clear Active bit in case of parallel mark_page_accessed */ + ClearPageActive(page); + list_add(&page->lru, &pages_to_free); } if (zone) @@ -795,30 +829,26 @@ void lru_add_page_tail(struct page *page, struct page *page_tail, static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec, void *arg) { - enum lru_list lru = (enum lru_list)arg; - int file = is_file_lru(lru); - int active = is_active_lru(lru); + int file = page_is_file_cache(page); + int active = PageActive(page); + enum lru_list lru = page_lru(page); - VM_BUG_ON(PageActive(page)); VM_BUG_ON(PageUnevictable(page)); VM_BUG_ON(PageLRU(page)); SetPageLRU(page); - if (active) - SetPageActive(page); add_page_to_lru_list(page, lruvec, lru); update_page_reclaim_stat(lruvec, file, active); + trace_mm_lru_insertion(page, page_to_pfn(page), lru, trace_pagemap_flags(page)); } /* * Add the passed pages to the LRU, then drop the caller's refcount * on them. Reinitialises the caller's pagevec. */ -void __pagevec_lru_add(struct pagevec *pvec, enum lru_list lru) +void __pagevec_lru_add(struct pagevec *pvec) { - VM_BUG_ON(is_unevictable_lru(lru)); - - pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, (void *)lru); + pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL); } EXPORT_SYMBOL(__pagevec_lru_add); diff --git a/mm/swapfile.c b/mm/swapfile.c index 746af55b..36af6ee 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -212,7 +212,7 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, si->cluster_nr = SWAPFILE_CLUSTER - 1; goto checks; } - if (si->flags & SWP_DISCARDABLE) { + if (si->flags & SWP_PAGE_DISCARD) { /* * Start range check on racing allocations, in case * they overlap the cluster we eventually decide on @@ -322,7 +322,7 @@ checks: if (si->lowest_alloc) { /* - * Only set when SWP_DISCARDABLE, and there's a scan + * Only set when SWP_PAGE_DISCARD, and there's a scan * for a free cluster in progress or just completed. */ if (found_free_cluster) { @@ -2016,6 +2016,20 @@ static int setup_swap_map_and_extents(struct swap_info_struct *p, return nr_extents; } +/* + * Helper to sys_swapon determining if a given swap + * backing device queue supports DISCARD operations. + */ +static bool swap_discardable(struct swap_info_struct *si) +{ + struct request_queue *q = bdev_get_queue(si->bdev); + + if (!q || !blk_queue_discard(q)) + return false; + + return true; +} + SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct *p; @@ -2123,8 +2137,37 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->flags |= SWP_SOLIDSTATE; p->cluster_next = 1 + (prandom_u32() % p->highest_bit); } - if ((swap_flags & SWAP_FLAG_DISCARD) && discard_swap(p) == 0) - p->flags |= SWP_DISCARDABLE; + + if ((swap_flags & SWAP_FLAG_DISCARD) && swap_discardable(p)) { + /* + * When discard is enabled for swap with no particular + * policy flagged, we set all swap discard flags here in + * order to sustain backward compatibility with older + * swapon(8) releases. + */ + p->flags |= (SWP_DISCARDABLE | SWP_AREA_DISCARD | + SWP_PAGE_DISCARD); + + /* + * By flagging sys_swapon, a sysadmin can tell us to + * either do single-time area discards only, or to just + * perform discards for released swap page-clusters. + * Now it's time to adjust the p->flags accordingly. + */ + if (swap_flags & SWAP_FLAG_DISCARD_ONCE) + p->flags &= ~SWP_PAGE_DISCARD; + else if (swap_flags & SWAP_FLAG_DISCARD_PAGES) + p->flags &= ~SWP_AREA_DISCARD; + + /* issue a swapon-time discard if it's still required */ + if (p->flags & SWP_AREA_DISCARD) { + int err = discard_swap(p); + if (unlikely(err)) + printk(KERN_ERR + "swapon: discard_swap(%p): %d\n", + p, err); + } + } } mutex_lock(&swapon_mutex); @@ -2135,11 +2178,13 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) enable_swap_info(p, prio, swap_map, frontswap_map); printk(KERN_INFO "Adding %uk swap on %s. " - "Priority:%d extents:%d across:%lluk %s%s%s\n", + "Priority:%d extents:%d across:%lluk %s%s%s%s%s\n", p->pages<<(PAGE_SHIFT-10), name->name, p->prio, nr_extents, (unsigned long long)span<<(PAGE_SHIFT-10), (p->flags & SWP_SOLIDSTATE) ? "SS" : "", (p->flags & SWP_DISCARDABLE) ? "D" : "", + (p->flags & SWP_AREA_DISCARD) ? "s" : "", + (p->flags & SWP_PAGE_DISCARD) ? "c" : "", (frontswap_map) ? "FS" : ""); mutex_unlock(&swapon_mutex); @@ -295,7 +295,6 @@ void arch_pick_mmap_layout(struct mm_struct *mm) { mm->mmap_base = TASK_UNMAPPED_BASE; mm->get_unmapped_area = arch_get_unmapped_area; - mm->unmap_area = arch_unmap_area; } #endif diff --git a/mm/vmalloc.c b/mm/vmalloc.c index d365724..13a5495 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -292,7 +292,7 @@ static struct vmap_area *__find_vmap_area(unsigned long addr) va = rb_entry(n, struct vmap_area, rb_node); if (addr < va->va_start) n = n->rb_left; - else if (addr > va->va_start) + else if (addr >= va->va_end) n = n->rb_right; else return va; @@ -388,12 +388,12 @@ nocache: addr = ALIGN(first->va_end, align); if (addr < vstart) goto nocache; - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; } else { addr = ALIGN(vstart, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; n = vmap_area_root.rb_node; @@ -420,7 +420,7 @@ nocache: if (addr + cached_hole_size < first->va_start) cached_hole_size = first->va_start - addr; addr = ALIGN(first->va_end, align); - if (addr + size - 1 < addr) + if (addr + size < addr) goto overflow; if (list_is_last(&first->list, &vmap_area_list)) @@ -754,7 +754,6 @@ struct vmap_block { struct vmap_area *va; struct vmap_block_queue *vbq; unsigned long free, dirty; - DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS); DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS); struct list_head free_list; struct rcu_head rcu_head; @@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask) vb->va = va; vb->free = VMAP_BBMAP_BITS; vb->dirty = 0; - bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS); INIT_LIST_HEAD(&vb->free_list); @@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu) if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) { vb->free = 0; /* prevent further allocs after releasing lock */ vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */ - bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS); bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS); spin_lock(&vbq->lock); list_del_rcu(&vb->free_list); @@ -891,11 +888,6 @@ static void purge_fragmented_blocks(int cpu) } } -static void purge_fragmented_blocks_thiscpu(void) -{ - purge_fragmented_blocks(smp_processor_id()); -} - static void purge_fragmented_blocks_allcpus(void) { int cpu; @@ -910,7 +902,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask) struct vmap_block *vb; unsigned long addr = 0; unsigned int order; - int purge = 0; BUG_ON(size & ~PAGE_MASK); BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC); @@ -934,17 +925,7 @@ again: if (vb->free < 1UL << order) goto next; - i = bitmap_find_free_region(vb->alloc_map, - VMAP_BBMAP_BITS, order); - - if (i < 0) { - if (vb->free + vb->dirty == VMAP_BBMAP_BITS) { - /* fragmented and no outstanding allocations */ - BUG_ON(vb->dirty != VMAP_BBMAP_BITS); - purge = 1; - } - goto next; - } + i = VMAP_BBMAP_BITS - vb->free; addr = vb->va->va_start + (i << PAGE_SHIFT); BUG_ON(addr_to_vb_idx(addr) != addr_to_vb_idx(vb->va->va_start)); @@ -960,9 +941,6 @@ next: spin_unlock(&vb->lock); } - if (purge) - purge_fragmented_blocks_thiscpu(); - put_cpu_var(vmap_block_queue); rcu_read_unlock(); @@ -1311,22 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, spin_unlock(&vmap_area_lock); } -static void clear_vm_unlist(struct vm_struct *vm) +static void clear_vm_uninitialized_flag(struct vm_struct *vm) { /* - * Before removing VM_UNLIST, + * Before removing VM_UNINITIALIZED, * we should make sure that vm has proper values. * Pair with smp_rmb() in show_numa_info(). */ smp_wmb(); - vm->flags &= ~VM_UNLIST; -} - -static void insert_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va, - unsigned long flags, const void *caller) -{ - setup_vmalloc_vm(vm, va, flags, caller); - clear_vm_unlist(vm); + vm->flags &= ~VM_UNINITIALIZED; } static struct vm_struct *__get_vm_area_node(unsigned long size, @@ -1337,16 +1308,8 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, struct vm_struct *area; BUG_ON(in_interrupt()); - if (flags & VM_IOREMAP) { - int bit = fls(size); - - if (bit > IOREMAP_MAX_ORDER) - bit = IOREMAP_MAX_ORDER; - else if (bit < PAGE_SHIFT) - bit = PAGE_SHIFT; - - align = 1ul << bit; - } + if (flags & VM_IOREMAP) + align = 1ul << clamp(fls(size), PAGE_SHIFT, IOREMAP_MAX_ORDER); size = PAGE_ALIGN(size); if (unlikely(!size)) @@ -1367,16 +1330,7 @@ static struct vm_struct *__get_vm_area_node(unsigned long size, return NULL; } - /* - * When this function is called from __vmalloc_node_range, - * we add VM_UNLIST flag to avoid accessing uninitialized - * members of vm_struct such as pages and nr_pages fields. - * They will be set later. - */ - if (flags & VM_UNLIST) - setup_vmalloc_vm(area, va, flags, caller); - else - insert_vmalloc_vm(area, va, flags, caller); + setup_vmalloc_vm(area, va, flags, caller); return area; } @@ -1476,10 +1430,9 @@ static void __vunmap(const void *addr, int deallocate_pages) if (!addr) return; - if ((PAGE_SIZE-1) & (unsigned long)addr) { - WARN(1, KERN_ERR "Trying to vfree() bad address (%p)\n", addr); + if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n", + addr)) return; - } area = remove_vm_area(addr); if (unlikely(!area)) { @@ -1524,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages) * conventions for vfree() arch-depenedent would be a really bad idea) * * NOTE: assumes that the object at *addr has a size >= sizeof(llist_node) - * */ void vfree(const void *addr) { @@ -1536,8 +1488,8 @@ void vfree(const void *addr) return; if (unlikely(in_interrupt())) { struct vfree_deferred *p = &__get_cpu_var(vfree_deferred); - llist_add((struct llist_node *)addr, &p->list); - schedule_work(&p->wq); + if (llist_add((struct llist_node *)addr, &p->list)) + schedule_work(&p->wq); } else __vunmap(addr, 1); } @@ -1682,21 +1634,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, if (!size || (size >> PAGE_SHIFT) > totalram_pages) goto fail; - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST, + area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED, start, end, node, gfp_mask, caller); if (!area) goto fail; addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller); if (!addr) - return NULL; + goto fail; /* - * In this function, newly allocated vm_struct has VM_UNLIST flag. - * It means that vm_struct is not fully initialized. + * In this function, newly allocated vm_struct has VM_UNINITIALIZED + * flag. It means that vm_struct is not fully initialized. * Now, it is fully initialized, so remove this flag here. */ - clear_vm_unlist(area); + clear_vm_uninitialized_flag(area); /* * A ref_count = 3 is needed because the vm_struct and vmap_area @@ -2148,42 +2100,43 @@ finished: } /** - * remap_vmalloc_range - map vmalloc pages to userspace - * @vma: vma to cover (map full range of vma) - * @addr: vmalloc memory - * @pgoff: number of pages into addr before first page to map + * remap_vmalloc_range_partial - map vmalloc pages to userspace + * @vma: vma to cover + * @uaddr: target user address to start at + * @kaddr: virtual address of vmalloc kernel memory + * @size: size of map area * * Returns: 0 for success, -Exxx on failure * - * This function checks that addr is a valid vmalloc'ed area, and - * that it is big enough to cover the vma. Will return failure if - * that criteria isn't met. + * This function checks that @kaddr is a valid vmalloc'ed area, + * and that it is big enough to cover the range starting at + * @uaddr in @vma. Will return failure if that criteria isn't + * met. * * Similar to remap_pfn_range() (see mm/memory.c) */ -int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, - unsigned long pgoff) +int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, + void *kaddr, unsigned long size) { struct vm_struct *area; - unsigned long uaddr = vma->vm_start; - unsigned long usize = vma->vm_end - vma->vm_start; - if ((PAGE_SIZE-1) & (unsigned long)addr) + size = PAGE_ALIGN(size); + + if (!PAGE_ALIGNED(uaddr) || !PAGE_ALIGNED(kaddr)) return -EINVAL; - area = find_vm_area(addr); + area = find_vm_area(kaddr); if (!area) return -EINVAL; if (!(area->flags & VM_USERMAP)) return -EINVAL; - if (usize + (pgoff << PAGE_SHIFT) > area->size - PAGE_SIZE) + if (kaddr + size > area->addr + area->size) return -EINVAL; - addr += pgoff << PAGE_SHIFT; do { - struct page *page = vmalloc_to_page(addr); + struct page *page = vmalloc_to_page(kaddr); int ret; ret = vm_insert_page(vma, uaddr, page); @@ -2191,14 +2144,37 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, return ret; uaddr += PAGE_SIZE; - addr += PAGE_SIZE; - usize -= PAGE_SIZE; - } while (usize > 0); + kaddr += PAGE_SIZE; + size -= PAGE_SIZE; + } while (size > 0); vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; return 0; } +EXPORT_SYMBOL(remap_vmalloc_range_partial); + +/** + * remap_vmalloc_range - map vmalloc pages to userspace + * @vma: vma to cover (map full range of vma) + * @addr: vmalloc memory + * @pgoff: number of pages into addr before first page to map + * + * Returns: 0 for success, -Exxx on failure + * + * This function checks that addr is a valid vmalloc'ed area, and + * that it is big enough to cover the vma. Will return failure if + * that criteria isn't met. + * + * Similar to remap_pfn_range() (see mm/memory.c) + */ +int remap_vmalloc_range(struct vm_area_struct *vma, void *addr, + unsigned long pgoff) +{ + return remap_vmalloc_range_partial(vma, vma->vm_start, + addr + (pgoff << PAGE_SHIFT), + vma->vm_end - vma->vm_start); +} EXPORT_SYMBOL(remap_vmalloc_range); /* @@ -2512,8 +2488,8 @@ found: /* insert all vm's */ for (area = 0; area < nr_vms; area++) - insert_vmalloc_vm(vms[area], vas[area], VM_ALLOC, - pcpu_get_vm_areas); + setup_vmalloc_vm(vms[area], vas[area], VM_ALLOC, + pcpu_get_vm_areas); kfree(vas); return vms; @@ -2592,11 +2568,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v) if (!counters) return; - /* Pair with smp_wmb() in clear_vm_unlist() */ - smp_rmb(); - if (v->flags & VM_UNLIST) - return; - memset(counters, 0, nr_node_ids * sizeof(unsigned int)); for (nr = 0; nr < v->nr_pages; nr++) @@ -2625,6 +2596,11 @@ static int s_show(struct seq_file *m, void *p) v = va->vm; + /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */ + smp_rmb(); + if (v->flags & VM_UNINITIALIZED) + return 0; + seq_printf(m, "0x%pK-0x%pK %7ld", v->addr, v->addr + v->size, v->size); diff --git a/mm/vmscan.c b/mm/vmscan.c index fa6a853..2cff0d4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -546,7 +546,6 @@ int remove_mapping(struct address_space *mapping, struct page *page) void putback_lru_page(struct page *page) { int lru; - int active = !!TestClearPageActive(page); int was_unevictable = PageUnevictable(page); VM_BUG_ON(PageLRU(page)); @@ -561,8 +560,8 @@ redo: * unevictable page on [in]active list. * We know how to handle that. */ - lru = active + page_lru_base_type(page); - lru_cache_add_lru(page, lru); + lru = page_lru_base_type(page); + lru_cache_add(page); } else { /* * Put unevictable pages directly on zone's unevictable @@ -669,6 +668,35 @@ static enum page_references page_check_references(struct page *page, return PAGEREF_RECLAIM; } +/* Check if a page is dirty or under writeback */ +static void page_check_dirty_writeback(struct page *page, + bool *dirty, bool *writeback) +{ + struct address_space *mapping; + + /* + * Anonymous pages are not handled by flushers and must be written + * from reclaim context. Do not stall reclaim based on them + */ + if (!page_is_file_cache(page)) { + *dirty = false; + *writeback = false; + return; + } + + /* By default assume that the page flags are accurate */ + *dirty = PageDirty(page); + *writeback = PageWriteback(page); + + /* Verify dirty/writeback state if the filesystem supports it */ + if (!page_has_private(page)) + return; + + mapping = page_mapping(page); + if (mapping && mapping->a_ops->is_dirty_writeback) + mapping->a_ops->is_dirty_writeback(page, dirty, writeback); +} + /* * shrink_page_list() returns the number of reclaimed pages */ @@ -677,16 +705,21 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct scan_control *sc, enum ttu_flags ttu_flags, unsigned long *ret_nr_dirty, + unsigned long *ret_nr_unqueued_dirty, + unsigned long *ret_nr_congested, unsigned long *ret_nr_writeback, + unsigned long *ret_nr_immediate, bool force_reclaim) { LIST_HEAD(ret_pages); LIST_HEAD(free_pages); int pgactivate = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_dirty = 0; unsigned long nr_congested = 0; unsigned long nr_reclaimed = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; cond_resched(); @@ -696,6 +729,7 @@ static unsigned long shrink_page_list(struct list_head *page_list, struct page *page; int may_enter_fs; enum page_references references = PAGEREF_RECLAIM_CLEAN; + bool dirty, writeback; cond_resched(); @@ -723,25 +757,77 @@ static unsigned long shrink_page_list(struct list_head *page_list, may_enter_fs = (sc->gfp_mask & __GFP_FS) || (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); + /* + * The number of dirty pages determines if a zone is marked + * reclaim_congested which affects wait_iff_congested. kswapd + * will stall and start writing pages if the tail of the LRU + * is all dirty unqueued pages. + */ + page_check_dirty_writeback(page, &dirty, &writeback); + if (dirty || writeback) + nr_dirty++; + + if (dirty && !writeback) + nr_unqueued_dirty++; + + /* + * Treat this page as congested if the underlying BDI is or if + * pages are cycling through the LRU so quickly that the + * pages marked for immediate reclaim are making it to the + * end of the LRU a second time. + */ + mapping = page_mapping(page); + if ((mapping && bdi_write_congested(mapping->backing_dev_info)) || + (writeback && PageReclaim(page))) + nr_congested++; + + /* + * If a page at the tail of the LRU is under writeback, there + * are three cases to consider. + * + * 1) If reclaim is encountering an excessive number of pages + * under writeback and this page is both under writeback and + * PageReclaim then it indicates that pages are being queued + * for IO but are being recycled through the LRU before the + * IO can complete. Waiting on the page itself risks an + * indefinite stall if it is impossible to writeback the + * page due to IO error or disconnected storage so instead + * note that the LRU is being scanned too quickly and the + * caller can stall after page list has been processed. + * + * 2) Global reclaim encounters a page, memcg encounters a + * page that is not marked for immediate reclaim or + * the caller does not have __GFP_IO. In this case mark + * the page for immediate reclaim and continue scanning. + * + * __GFP_IO is checked because a loop driver thread might + * enter reclaim, and deadlock if it waits on a page for + * which it is needed to do the write (loop masks off + * __GFP_IO|__GFP_FS for this reason); but more thought + * would probably show more reasons. + * + * Don't require __GFP_FS, since we're not going into the + * FS, just waiting on its writeback completion. Worryingly, + * ext4 gfs2 and xfs allocate pages with + * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so testing + * may_enter_fs here is liable to OOM on them. + * + * 3) memcg encounters a page that is not already marked + * PageReclaim. memcg does not have any dirty pages + * throttling so we could easily OOM just because too many + * pages are in writeback and there is nothing else to + * reclaim. Wait for the writeback to complete. + */ if (PageWriteback(page)) { - /* - * memcg doesn't have any dirty pages throttling so we - * could easily OOM just because too many pages are in - * writeback and there is nothing else to reclaim. - * - * Check __GFP_IO, certainly because a loop driver - * thread might enter reclaim, and deadlock if it waits - * on a page for which it is needed to do the write - * (loop masks off __GFP_IO|__GFP_FS for this reason); - * but more thought would probably show more reasons. - * - * Don't require __GFP_FS, since we're not going into - * the FS, just waiting on its writeback completion. - * Worryingly, ext4 gfs2 and xfs allocate pages with - * grab_cache_page_write_begin(,,AOP_FLAG_NOFS), so - * testing may_enter_fs here is liable to OOM on them. - */ - if (global_reclaim(sc) || + /* Case 1 above */ + if (current_is_kswapd() && + PageReclaim(page) && + zone_is_reclaim_writeback(zone)) { + nr_immediate++; + goto keep_locked; + + /* Case 2 above */ + } else if (global_reclaim(sc) || !PageReclaim(page) || !(sc->gfp_mask & __GFP_IO)) { /* * This is slightly racy - end_page_writeback() @@ -756,9 +842,13 @@ static unsigned long shrink_page_list(struct list_head *page_list, */ SetPageReclaim(page); nr_writeback++; + goto keep_locked; + + /* Case 3 above */ + } else { + wait_on_page_writeback(page); } - wait_on_page_writeback(page); } if (!force_reclaim) @@ -784,9 +874,10 @@ static unsigned long shrink_page_list(struct list_head *page_list, if (!add_to_swap(page, page_list)) goto activate_locked; may_enter_fs = 1; - } - mapping = page_mapping(page); + /* Adding to swap updated mapping */ + mapping = page_mapping(page); + } /* * The page is mapped into the page tables of one or more @@ -806,16 +897,14 @@ static unsigned long shrink_page_list(struct list_head *page_list, } if (PageDirty(page)) { - nr_dirty++; - /* * Only kswapd can writeback filesystem pages to - * avoid risk of stack overflow but do not writeback - * unless under significant pressure. + * avoid risk of stack overflow but only writeback + * if many dirty pages have been encountered. */ if (page_is_file_cache(page) && (!current_is_kswapd() || - sc->priority >= DEF_PRIORITY - 2)) { + !zone_is_reclaim_dirty(zone))) { /* * Immediately reclaim when written back. * Similar in principal to deactivate_page() @@ -838,7 +927,6 @@ static unsigned long shrink_page_list(struct list_head *page_list, /* Page is dirty, try to write it out here */ switch (pageout(page, mapping, sc)) { case PAGE_KEEP: - nr_congested++; goto keep_locked; case PAGE_ACTIVATE: goto activate_locked; @@ -946,22 +1034,16 @@ keep: VM_BUG_ON(PageLRU(page) || PageUnevictable(page)); } - /* - * Tag a zone as congested if all the dirty pages encountered were - * backed by a congested BDI. In this case, reclaimers should just - * back off and wait for congestion to clear because further reclaim - * will encounter the same problem - */ - if (nr_dirty && nr_dirty == nr_congested && global_reclaim(sc)) - zone_set_flag(zone, ZONE_CONGESTED); - free_hot_cold_page_list(&free_pages, 1); list_splice(&ret_pages, page_list); count_vm_events(PGACTIVATE, pgactivate); mem_cgroup_uncharge_end(); *ret_nr_dirty += nr_dirty; + *ret_nr_congested += nr_congested; + *ret_nr_unqueued_dirty += nr_unqueued_dirty; *ret_nr_writeback += nr_writeback; + *ret_nr_immediate += nr_immediate; return nr_reclaimed; } @@ -973,7 +1055,7 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, .priority = DEF_PRIORITY, .may_unmap = 1, }; - unsigned long ret, dummy1, dummy2; + unsigned long ret, dummy1, dummy2, dummy3, dummy4, dummy5; struct page *page, *next; LIST_HEAD(clean_pages); @@ -985,8 +1067,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone, } ret = shrink_page_list(&clean_pages, zone, &sc, - TTU_UNMAP|TTU_IGNORE_ACCESS, - &dummy1, &dummy2, true); + TTU_UNMAP|TTU_IGNORE_ACCESS, + &dummy1, &dummy2, &dummy3, &dummy4, &dummy5, true); list_splice(&clean_pages, page_list); __mod_zone_page_state(zone, NR_ISOLATED_FILE, -ret); return ret; @@ -1281,7 +1363,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, unsigned long nr_reclaimed = 0; unsigned long nr_taken; unsigned long nr_dirty = 0; + unsigned long nr_congested = 0; + unsigned long nr_unqueued_dirty = 0; unsigned long nr_writeback = 0; + unsigned long nr_immediate = 0; isolate_mode_t isolate_mode = 0; int file = is_file_lru(lru); struct zone *zone = lruvec_zone(lruvec); @@ -1323,7 +1408,9 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, return 0; nr_reclaimed = shrink_page_list(&page_list, zone, sc, TTU_UNMAP, - &nr_dirty, &nr_writeback, false); + &nr_dirty, &nr_unqueued_dirty, &nr_congested, + &nr_writeback, &nr_immediate, + false); spin_lock_irq(&zone->lru_lock); @@ -1356,21 +1443,51 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, * as there is no guarantee the dirtying process is throttled in the * same way balance_dirty_pages() manages. * - * This scales the number of dirty pages that must be under writeback - * before throttling depending on priority. It is a simple backoff - * function that has the most effect in the range DEF_PRIORITY to - * DEF_PRIORITY-2 which is the priority reclaim is considered to be - * in trouble and reclaim is considered to be in trouble. - * - * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle - * DEF_PRIORITY-1 50% must be PageWriteback - * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble - * ... - * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any - * isolated page is PageWriteback + * Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number + * of pages under pages flagged for immediate reclaim and stall if any + * are encountered in the nr_immediate check below. + */ + if (nr_writeback && nr_writeback == nr_taken) + zone_set_flag(zone, ZONE_WRITEBACK); + + /* + * memcg will stall in page writeback so only consider forcibly + * stalling for global reclaim */ - if (nr_writeback && nr_writeback >= - (nr_taken >> (DEF_PRIORITY - sc->priority))) + if (global_reclaim(sc)) { + /* + * Tag a zone as congested if all the dirty pages scanned were + * backed by a congested BDI and wait_iff_congested will stall. + */ + if (nr_dirty && nr_dirty == nr_congested) + zone_set_flag(zone, ZONE_CONGESTED); + + /* + * If dirty pages are scanned that are not queued for IO, it + * implies that flushers are not keeping up. In this case, flag + * the zone ZONE_TAIL_LRU_DIRTY and kswapd will start writing + * pages from reclaim context. It will forcibly stall in the + * next check. + */ + if (nr_unqueued_dirty == nr_taken) + zone_set_flag(zone, ZONE_TAIL_LRU_DIRTY); + + /* + * In addition, if kswapd scans pages marked marked for + * immediate reclaim and under writeback (nr_immediate), it + * implies that pages are cycling through the LRU faster than + * they are written so also forcibly stall. + */ + if (nr_unqueued_dirty == nr_taken || nr_immediate) + congestion_wait(BLK_RW_ASYNC, HZ/10); + } + + /* + * Stall direct reclaim for IO completions if underlying BDIs or zone + * is congested. Allow kswapd to continue until it starts encountering + * unqueued dirty pages or cycling through the LRU too quickly. + */ + if (!sc->hibernation_mode && !current_is_kswapd()) wait_iff_congested(zone, BLK_RW_ASYNC, HZ/10); trace_mm_vmscan_lru_shrink_inactive(zone->zone_pgdat->node_id, @@ -1822,17 +1939,25 @@ out: static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) { unsigned long nr[NR_LRU_LISTS]; + unsigned long targets[NR_LRU_LISTS]; unsigned long nr_to_scan; enum lru_list lru; unsigned long nr_reclaimed = 0; unsigned long nr_to_reclaim = sc->nr_to_reclaim; struct blk_plug plug; + bool scan_adjusted = false; get_scan_count(lruvec, sc, nr); + /* Record the original scan target for proportional adjustments later */ + memcpy(targets, nr, sizeof(nr)); + blk_start_plug(&plug); while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] || nr[LRU_INACTIVE_FILE]) { + unsigned long nr_anon, nr_file, percentage; + unsigned long nr_scanned; + for_each_evictable_lru(lru) { if (nr[lru]) { nr_to_scan = min(nr[lru], SWAP_CLUSTER_MAX); @@ -1842,17 +1967,60 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) lruvec, sc); } } + + if (nr_reclaimed < nr_to_reclaim || scan_adjusted) + continue; + /* - * On large memory systems, scan >> priority can become - * really large. This is fine for the starting priority; - * we want to put equal scanning pressure on each zone. - * However, if the VM has a harder time of freeing pages, - * with multiple processes reclaiming pages, the total - * freeing target can get unreasonably large. + * For global direct reclaim, reclaim only the number of pages + * requested. Less care is taken to scan proportionally as it + * is more important to minimise direct reclaim stall latency + * than it is to properly age the LRU lists. */ - if (nr_reclaimed >= nr_to_reclaim && - sc->priority < DEF_PRIORITY) + if (global_reclaim(sc) && !current_is_kswapd()) break; + + /* + * For kswapd and memcg, reclaim at least the number of pages + * requested. Ensure that the anon and file LRUs shrink + * proportionally what was requested by get_scan_count(). We + * stop reclaiming one LRU and reduce the amount scanning + * proportional to the original scan target. + */ + nr_file = nr[LRU_INACTIVE_FILE] + nr[LRU_ACTIVE_FILE]; + nr_anon = nr[LRU_INACTIVE_ANON] + nr[LRU_ACTIVE_ANON]; + + if (nr_file > nr_anon) { + unsigned long scan_target = targets[LRU_INACTIVE_ANON] + + targets[LRU_ACTIVE_ANON] + 1; + lru = LRU_BASE; + percentage = nr_anon * 100 / scan_target; + } else { + unsigned long scan_target = targets[LRU_INACTIVE_FILE] + + targets[LRU_ACTIVE_FILE] + 1; + lru = LRU_FILE; + percentage = nr_file * 100 / scan_target; + } + + /* Stop scanning the smaller of the LRU */ + nr[lru] = 0; + nr[lru + LRU_ACTIVE] = 0; + + /* + * Recalculate the other LRU scan count based on its original + * scan target and the percentage scanning already complete + */ + lru = (lru == LRU_FILE) ? LRU_BASE : LRU_FILE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + lru += LRU_ACTIVE; + nr_scanned = targets[lru] - nr[lru]; + nr[lru] = targets[lru] * (100 - percentage) / 100; + nr[lru] -= min(nr[lru], nr_scanned); + + scan_adjusted = true; } blk_finish_plug(&plug); sc->nr_reclaimed += nr_reclaimed; @@ -2179,8 +2347,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, aborted_reclaim = shrink_zones(zonelist, sc); /* - * Don't shrink slabs when reclaiming memory from - * over limit cgroups + * Don't shrink slabs when reclaiming memory from over limit + * cgroups but do shrink slab at least once when aborting + * reclaim for compaction to avoid unevenly scanning file/anon + * LRU pages over slab pages. */ if (global_reclaim(sc)) { unsigned long lru_pages = 0; @@ -2222,18 +2392,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist, WB_REASON_TRY_TO_FREE_PAGES); sc->may_writepage = 1; } - - /* Take a nap, wait for some writeback to complete */ - if (!sc->hibernation_mode && sc->nr_scanned && - sc->priority < DEF_PRIORITY - 2) { - struct zone *preferred_zone; - - first_zones_zonelist(zonelist, gfp_zone(sc->gfp_mask), - &cpuset_current_mems_allowed, - &preferred_zone); - wait_iff_congested(preferred_zone, BLK_RW_ASYNC, HZ/10); - } - } while (--sc->priority >= 0); + } while (--sc->priority >= 0 && !aborted_reclaim); out: delayacct_freepages_end(); @@ -2601,6 +2760,91 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, } /* + * kswapd shrinks the zone by the number of pages required to reach + * the high watermark. + * + * Returns true if kswapd scanned at least the requested number of pages to + * reclaim or if the lack of progress was due to pages under writeback. + * This is used to determine if the scanning priority needs to be raised. + */ +static bool kswapd_shrink_zone(struct zone *zone, + int classzone_idx, + struct scan_control *sc, + unsigned long lru_pages, + unsigned long *nr_attempted) +{ + unsigned long nr_slab; + int testorder = sc->order; + unsigned long balance_gap; + struct reclaim_state *reclaim_state = current->reclaim_state; + struct shrink_control shrink = { + .gfp_mask = sc->gfp_mask, + }; + bool lowmem_pressure; + + /* Reclaim above the high watermark. */ + sc->nr_to_reclaim = max(SWAP_CLUSTER_MAX, high_wmark_pages(zone)); + + /* + * Kswapd reclaims only single pages with compaction enabled. Trying + * too hard to reclaim until contiguous free pages have become + * available can hurt performance by evicting too much useful data + * from memory. Do not reclaim more than needed for compaction. + */ + if (IS_ENABLED(CONFIG_COMPACTION) && sc->order && + compaction_suitable(zone, sc->order) != + COMPACT_SKIPPED) + testorder = 0; + + /* + * We put equal pressure on every zone, unless one zone has way too + * many pages free already. The "too many pages" is defined as the + * high wmark plus a "gap" where the gap is either the low + * watermark or 1% of the zone, whichever is smaller. + */ + balance_gap = min(low_wmark_pages(zone), + (zone->managed_pages + KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / + KSWAPD_ZONE_BALANCE_GAP_RATIO); + + /* + * If there is no low memory pressure or the zone is balanced then no + * reclaim is necessary + */ + lowmem_pressure = (buffer_heads_over_limit && is_highmem(zone)); + if (!lowmem_pressure && zone_balanced(zone, testorder, + balance_gap, classzone_idx)) + return true; + + shrink_zone(zone, sc); + + reclaim_state->reclaimed_slab = 0; + nr_slab = shrink_slab(&shrink, sc->nr_scanned, lru_pages); + sc->nr_reclaimed += reclaim_state->reclaimed_slab; + + /* Account for the number of pages attempted to reclaim */ + *nr_attempted += sc->nr_to_reclaim; + + if (nr_slab == 0 && !zone_reclaimable(zone)) + zone->all_unreclaimable = 1; + + zone_clear_flag(zone, ZONE_WRITEBACK); + + /* + * If a zone reaches its high watermark, consider it to be no longer + * congested. It's possible there are dirty pages backed by congested + * BDIs but as pressure is relieved, speculatively avoid congestion + * waits. + */ + if (!zone->all_unreclaimable && + zone_balanced(zone, testorder, 0, classzone_idx)) { + zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); + } + + return sc->nr_scanned >= sc->nr_to_reclaim; +} + +/* * For kswapd, balance_pgdat() will work across all this node's zones until * they are all at high_wmark_pages(zone). * @@ -2624,35 +2868,28 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, long remaining, static unsigned long balance_pgdat(pg_data_t *pgdat, int order, int *classzone_idx) { - bool pgdat_is_balanced = false; int i; int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ - struct reclaim_state *reclaim_state = current->reclaim_state; unsigned long nr_soft_reclaimed; unsigned long nr_soft_scanned; struct scan_control sc = { .gfp_mask = GFP_KERNEL, + .priority = DEF_PRIORITY, .may_unmap = 1, .may_swap = 1, - /* - * kswapd doesn't want to be bailed out while reclaim. because - * we want to put equal scanning pressure on each zone. - */ - .nr_to_reclaim = ULONG_MAX, + .may_writepage = !laptop_mode, .order = order, .target_mem_cgroup = NULL, }; - struct shrink_control shrink = { - .gfp_mask = sc.gfp_mask, - }; -loop_again: - sc.priority = DEF_PRIORITY; - sc.nr_reclaimed = 0; - sc.may_writepage = !laptop_mode; count_vm_event(PAGEOUTRUN); do { unsigned long lru_pages = 0; + unsigned long nr_attempted = 0; + bool raise_priority = true; + bool pgdat_needs_compaction = (order > 0); + + sc.nr_reclaimed = 0; /* * Scan in the highmem->dma direction for the highest @@ -2689,23 +2926,46 @@ loop_again: end_zone = i; break; } else { - /* If balanced, clear the congested flag */ + /* + * If balanced, clear the dirty and congested + * flags + */ zone_clear_flag(zone, ZONE_CONGESTED); + zone_clear_flag(zone, ZONE_TAIL_LRU_DIRTY); } } - if (i < 0) { - pgdat_is_balanced = true; + if (i < 0) goto out; - } for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; + if (!populated_zone(zone)) + continue; + lru_pages += zone_reclaimable_pages(zone); + + /* + * If any zone is currently balanced then kswapd will + * not call compaction as it is expected that the + * necessary pages are already available. + */ + if (pgdat_needs_compaction && + zone_watermark_ok(zone, order, + low_wmark_pages(zone), + *classzone_idx, 0)) + pgdat_needs_compaction = false; } /* + * If we're getting trouble reclaiming, start doing writepage + * even in laptop mode. + */ + if (sc.priority < DEF_PRIORITY - 2) + sc.may_writepage = 1; + + /* * Now scan the zone in the dma->highmem direction, stopping * at the last zone which needs scanning. * @@ -2716,8 +2976,6 @@ loop_again: */ for (i = 0; i <= end_zone; i++) { struct zone *zone = pgdat->node_zones + i; - int nr_slab, testorder; - unsigned long balance_gap; if (!populated_zone(zone)) continue; @@ -2738,65 +2996,14 @@ loop_again: sc.nr_reclaimed += nr_soft_reclaimed; /* - * We put equal pressure on every zone, unless - * one zone has way too many pages free - * already. The "too many pages" is defined - * as the high wmark plus a "gap" where the - * gap is either the low watermark or 1% - * of the zone, whichever is smaller. + * There should be no need to raise the scanning + * priority if enough pages are already being scanned + * that that high watermark would be met at 100% + * efficiency. */ - balance_gap = min(low_wmark_pages(zone), - (zone->managed_pages + - KSWAPD_ZONE_BALANCE_GAP_RATIO-1) / - KSWAPD_ZONE_BALANCE_GAP_RATIO); - /* - * Kswapd reclaims only single pages with compaction - * enabled. Trying too hard to reclaim until contiguous - * free pages have become available can hurt performance - * by evicting too much useful data from memory. - * Do not reclaim more than needed for compaction. - */ - testorder = order; - if (IS_ENABLED(CONFIG_COMPACTION) && order && - compaction_suitable(zone, order) != - COMPACT_SKIPPED) - testorder = 0; - - if ((buffer_heads_over_limit && is_highmem_idx(i)) || - !zone_balanced(zone, testorder, - balance_gap, end_zone)) { - shrink_zone(zone, &sc); - - reclaim_state->reclaimed_slab = 0; - nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages); - sc.nr_reclaimed += reclaim_state->reclaimed_slab; - - if (nr_slab == 0 && !zone_reclaimable(zone)) - zone->all_unreclaimable = 1; - } - - /* - * If we're getting trouble reclaiming, start doing - * writepage even in laptop mode. - */ - if (sc.priority < DEF_PRIORITY - 2) - sc.may_writepage = 1; - - if (zone->all_unreclaimable) { - if (end_zone && end_zone == i) - end_zone--; - continue; - } - - if (zone_balanced(zone, testorder, 0, end_zone)) - /* - * If a zone reaches its high watermark, - * consider it to be no longer congested. It's - * possible there are dirty pages backed by - * congested BDIs but as pressure is relieved, - * speculatively avoid congestion waits - */ - zone_clear_flag(zone, ZONE_CONGESTED); + if (kswapd_shrink_zone(zone, end_zone, &sc, + lru_pages, &nr_attempted)) + raise_priority = false; } /* @@ -2808,74 +3015,38 @@ loop_again: pfmemalloc_watermark_ok(pgdat)) wake_up(&pgdat->pfmemalloc_wait); - if (pgdat_balanced(pgdat, order, *classzone_idx)) { - pgdat_is_balanced = true; - break; /* kswapd: all done */ - } - /* - * We do this so kswapd doesn't build up large priorities for - * example when it is freeing in parallel with allocators. It - * matches the direct reclaim path behaviour in terms of impact - * on zone->*_priority. + * Fragmentation may mean that the system cannot be rebalanced + * for high-order allocations in all zones. If twice the + * allocation size has been reclaimed and the zones are still + * not balanced then recheck the watermarks at order-0 to + * prevent kswapd reclaiming excessively. Assume that a + * process requested a high-order can direct reclaim/compact. */ - if (sc.nr_reclaimed >= SWAP_CLUSTER_MAX) - break; - } while (--sc.priority >= 0); - -out: - if (!pgdat_is_balanced) { - cond_resched(); + if (order && sc.nr_reclaimed >= 2UL << order) + order = sc.order = 0; - try_to_freeze(); + /* Check if kswapd should be suspending */ + if (try_to_freeze() || kthread_should_stop()) + break; /* - * Fragmentation may mean that the system cannot be - * rebalanced for high-order allocations in all zones. - * At this point, if nr_reclaimed < SWAP_CLUSTER_MAX, - * it means the zones have been fully scanned and are still - * not balanced. For high-order allocations, there is - * little point trying all over again as kswapd may - * infinite loop. - * - * Instead, recheck all watermarks at order-0 as they - * are the most important. If watermarks are ok, kswapd will go - * back to sleep. High-order users can still perform direct - * reclaim if they wish. + * Compact if necessary and kswapd is reclaiming at least the + * high watermark number of pages as requsted */ - if (sc.nr_reclaimed < SWAP_CLUSTER_MAX) - order = sc.order = 0; - - goto loop_again; - } - - /* - * If kswapd was reclaiming at a higher order, it has the option of - * sleeping without all zones being balanced. Before it does, it must - * ensure that the watermarks for order-0 on *all* zones are met and - * that the congestion flags are cleared. The congestion flag must - * be cleared as kswapd is the only mechanism that clears the flag - * and it is potentially going to sleep here. - */ - if (order) { - int zones_need_compaction = 1; - - for (i = 0; i <= end_zone; i++) { - struct zone *zone = pgdat->node_zones + i; - - if (!populated_zone(zone)) - continue; - - /* Check if the memory needs to be defragmented. */ - if (zone_watermark_ok(zone, order, - low_wmark_pages(zone), *classzone_idx, 0)) - zones_need_compaction = 0; - } - - if (zones_need_compaction) + if (pgdat_needs_compaction && sc.nr_reclaimed > nr_attempted) compact_pgdat(pgdat, order); - } + /* + * Raise priority if scanning rate is too low or there was no + * progress in reclaiming pages + */ + if (raise_priority || !sc.nr_reclaimed) + sc.priority--; + } while (sc.priority >= 1 && + !pgdat_balanced(pgdat, order, *classzone_idx)); + +out: /* * Return the order we were reclaiming at so prepare_kswapd_sleep() * makes a decision on the order we were last reclaiming at. However, diff --git a/mm/zbud.c b/mm/zbud.c new file mode 100644 index 0000000..9bb4710 --- /dev/null +++ b/mm/zbud.c @@ -0,0 +1,527 @@ +/* + * zbud.c + * + * Copyright (C) 2013, Seth Jennings, IBM + * + * Concepts based on zcache internal zbud allocator by Dan Magenheimer. + * + * zbud is an special purpose allocator for storing compressed pages. Contrary + * to what its name may suggest, zbud is not a buddy allocator, but rather an + * allocator that "buddies" two compressed pages together in a single memory + * page. + * + * While this design limits storage density, it has simple and deterministic + * reclaim properties that make it preferable to a higher density approach when + * reclaim will be used. + * + * zbud works by storing compressed pages, or "zpages", together in pairs in a + * single memory page called a "zbud page". The first buddy is "left + * justifed" at the beginning of the zbud page, and the last buddy is "right + * justified" at the end of the zbud page. The benefit is that if either + * buddy is freed, the freed buddy space, coalesced with whatever slack space + * that existed between the buddies, results in the largest possible free region + * within the zbud page. + * + * zbud also provides an attractive lower bound on density. The ratio of zpages + * to zbud pages can not be less than 1. This ensures that zbud can never "do + * harm" by using more pages to store zpages than the uncompressed zpages would + * have used on their own. + * + * zbud pages are divided into "chunks". The size of the chunks is fixed at + * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages + * into chunks allows organizing unbuddied zbud pages into a manageable number + * of unbuddied lists according to the number of free chunks available in the + * zbud page. + * + * The zbud API differs from that of conventional allocators in that the + * allocation function, zbud_alloc(), returns an opaque handle to the user, + * not a dereferenceable pointer. The user must map the handle using + * zbud_map() in order to get a usable pointer by which to access the + * allocation data and unmap the handle with zbud_unmap() when operations + * on the allocation data are complete. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/atomic.h> +#include <linux/list.h> +#include <linux/mm.h> +#include <linux/module.h> +#include <linux/preempt.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/zbud.h> + +/***************** + * Structures +*****************/ +/* + * NCHUNKS_ORDER determines the internal allocation granularity, effectively + * adjusting internal fragmentation. It also determines the number of + * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the + * allocation granularity will be in chunks of size PAGE_SIZE/64, and there + * will be 64 freelists per pool. + */ +#define NCHUNKS_ORDER 6 + +#define CHUNK_SHIFT (PAGE_SHIFT - NCHUNKS_ORDER) +#define CHUNK_SIZE (1 << CHUNK_SHIFT) +#define NCHUNKS (PAGE_SIZE >> CHUNK_SHIFT) +#define ZHDR_SIZE_ALIGNED CHUNK_SIZE + +/** + * struct zbud_pool - stores metadata for each zbud pool + * @lock: protects all pool fields and first|last_chunk fields of any + * zbud page in the pool + * @unbuddied: array of lists tracking zbud pages that only contain one buddy; + * the lists each zbud page is added to depends on the size of + * its free region. + * @buddied: list tracking the zbud pages that contain two buddies; + * these zbud pages are full + * @lru: list tracking the zbud pages in LRU order by most recently + * added buddy. + * @pages_nr: number of zbud pages in the pool. + * @ops: pointer to a structure of user defined operations specified at + * pool creation time. + * + * This structure is allocated at pool creation time and maintains metadata + * pertaining to a particular zbud pool. + */ +struct zbud_pool { + spinlock_t lock; + struct list_head unbuddied[NCHUNKS]; + struct list_head buddied; + struct list_head lru; + u64 pages_nr; + struct zbud_ops *ops; +}; + +/* + * struct zbud_header - zbud page metadata occupying the first chunk of each + * zbud page. + * @buddy: links the zbud page into the unbuddied/buddied lists in the pool + * @lru: links the zbud page into the lru list in the pool + * @first_chunks: the size of the first buddy in chunks, 0 if free + * @last_chunks: the size of the last buddy in chunks, 0 if free + */ +struct zbud_header { + struct list_head buddy; + struct list_head lru; + unsigned int first_chunks; + unsigned int last_chunks; + bool under_reclaim; +}; + +/***************** + * Helpers +*****************/ +/* Just to make the code easier to read */ +enum buddy { + FIRST, + LAST +}; + +/* Converts an allocation size in bytes to size in zbud chunks */ +static int size_to_chunks(int size) +{ + return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT; +} + +#define for_each_unbuddied_list(_iter, _begin) \ + for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++) + +/* Initializes the zbud header of a newly allocated zbud page */ +static struct zbud_header *init_zbud_page(struct page *page) +{ + struct zbud_header *zhdr = page_address(page); + zhdr->first_chunks = 0; + zhdr->last_chunks = 0; + INIT_LIST_HEAD(&zhdr->buddy); + INIT_LIST_HEAD(&zhdr->lru); + zhdr->under_reclaim = 0; + return zhdr; +} + +/* Resets the struct page fields and frees the page */ +static void free_zbud_page(struct zbud_header *zhdr) +{ + __free_page(virt_to_page(zhdr)); +} + +/* + * Encodes the handle of a particular buddy within a zbud page + * Pool lock should be held as this function accesses first|last_chunks + */ +static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud) +{ + unsigned long handle; + + /* + * For now, the encoded handle is actually just the pointer to the data + * but this might not always be the case. A little information hiding. + * Add CHUNK_SIZE to the handle if it is the first allocation to jump + * over the zbud header in the first chunk. + */ + handle = (unsigned long)zhdr; + if (bud == FIRST) + /* skip over zbud header */ + handle += ZHDR_SIZE_ALIGNED; + else /* bud == LAST */ + handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT); + return handle; +} + +/* Returns the zbud page where a given handle is stored */ +static struct zbud_header *handle_to_zbud_header(unsigned long handle) +{ + return (struct zbud_header *)(handle & PAGE_MASK); +} + +/* Returns the number of free chunks in a zbud page */ +static int num_free_chunks(struct zbud_header *zhdr) +{ + /* + * Rather than branch for different situations, just use the fact that + * free buddies have a length of zero to simplify everything. -1 at the + * end for the zbud header. + */ + return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1; +} + +/***************** + * API Functions +*****************/ +/** + * zbud_create_pool() - create a new zbud pool + * @gfp: gfp flags when allocating the zbud pool structure + * @ops: user-defined operations for the zbud pool + * + * Return: pointer to the new zbud pool or NULL if the metadata allocation + * failed. + */ +struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops) +{ + struct zbud_pool *pool; + int i; + + pool = kmalloc(sizeof(struct zbud_pool), gfp); + if (!pool) + return NULL; + spin_lock_init(&pool->lock); + for_each_unbuddied_list(i, 0) + INIT_LIST_HEAD(&pool->unbuddied[i]); + INIT_LIST_HEAD(&pool->buddied); + INIT_LIST_HEAD(&pool->lru); + pool->pages_nr = 0; + pool->ops = ops; + return pool; +} + +/** + * zbud_destroy_pool() - destroys an existing zbud pool + * @pool: the zbud pool to be destroyed + * + * The pool should be emptied before this function is called. + */ +void zbud_destroy_pool(struct zbud_pool *pool) +{ + kfree(pool); +} + +/** + * zbud_alloc() - allocates a region of a given size + * @pool: zbud pool from which to allocate + * @size: size in bytes of the desired allocation + * @gfp: gfp flags used if the pool needs to grow + * @handle: handle of the new allocation + * + * This function will attempt to find a free region in the pool large enough to + * satisfy the allocation request. A search of the unbuddied lists is + * performed first. If no suitable free region is found, then a new page is + * allocated and added to the pool to satisfy the request. + * + * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used + * as zbud pool pages. + * + * Return: 0 if success and handle is set, otherwise -EINVAL is the size or + * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate + * a new page. + */ +int zbud_alloc(struct zbud_pool *pool, int size, gfp_t gfp, + unsigned long *handle) +{ + int chunks, i, freechunks; + struct zbud_header *zhdr = NULL; + enum buddy bud; + struct page *page; + + if (size <= 0 || gfp & __GFP_HIGHMEM) + return -EINVAL; + if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED) + return -ENOSPC; + chunks = size_to_chunks(size); + spin_lock(&pool->lock); + + /* First, try to find an unbuddied zbud page. */ + zhdr = NULL; + for_each_unbuddied_list(i, chunks) { + if (!list_empty(&pool->unbuddied[i])) { + zhdr = list_first_entry(&pool->unbuddied[i], + struct zbud_header, buddy); + list_del(&zhdr->buddy); + if (zhdr->first_chunks == 0) + bud = FIRST; + else + bud = LAST; + goto found; + } + } + + /* Couldn't find unbuddied zbud page, create new one */ + spin_unlock(&pool->lock); + page = alloc_page(gfp); + if (!page) + return -ENOMEM; + spin_lock(&pool->lock); + pool->pages_nr++; + zhdr = init_zbud_page(page); + bud = FIRST; + +found: + if (bud == FIRST) + zhdr->first_chunks = chunks; + else + zhdr->last_chunks = chunks; + + if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* Add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* Add/move zbud page to beginning of LRU */ + if (!list_empty(&zhdr->lru)) + list_del(&zhdr->lru); + list_add(&zhdr->lru, &pool->lru); + + *handle = encode_handle(zhdr, bud); + spin_unlock(&pool->lock); + + return 0; +} + +/** + * zbud_free() - frees the allocation associated with the given handle + * @pool: pool in which the allocation resided + * @handle: handle associated with the allocation returned by zbud_alloc() + * + * In the case that the zbud page in which the allocation resides is under + * reclaim, as indicated by the PG_reclaim flag being set, this function + * only sets the first|last_chunks to 0. The page is actually freed + * once both buddies are evicted (see zbud_reclaim_page() below). + */ +void zbud_free(struct zbud_pool *pool, unsigned long handle) +{ + struct zbud_header *zhdr; + int freechunks; + + spin_lock(&pool->lock); + zhdr = handle_to_zbud_header(handle); + + /* If first buddy, handle will be page aligned */ + if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK) + zhdr->last_chunks = 0; + else + zhdr->first_chunks = 0; + + if (zhdr->under_reclaim) { + /* zbud page is under reclaim, reclaim will free */ + spin_unlock(&pool->lock); + return; + } + + /* Remove from existing buddy list */ + list_del(&zhdr->buddy); + + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* zbud page is empty, free */ + list_del(&zhdr->lru); + free_zbud_page(zhdr); + pool->pages_nr--; + } else { + /* Add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } + + spin_unlock(&pool->lock); +} + +#define list_tail_entry(ptr, type, member) \ + list_entry((ptr)->prev, type, member) + +/** + * zbud_reclaim_page() - evicts allocations from a pool page and frees it + * @pool: pool from which a page will attempt to be evicted + * @retires: number of pages on the LRU list for which eviction will + * be attempted before failing + * + * zbud reclaim is different from normal system reclaim in that the reclaim is + * done from the bottom, up. This is because only the bottom layer, zbud, has + * information on how the allocations are organized within each zbud page. This + * has the potential to create interesting locking situations between zbud and + * the user, however. + * + * To avoid these, this is how zbud_reclaim_page() should be called: + + * The user detects a page should be reclaimed and calls zbud_reclaim_page(). + * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call + * the user-defined eviction handler with the pool and handle as arguments. + * + * If the handle can not be evicted, the eviction handler should return + * non-zero. zbud_reclaim_page() will add the zbud page back to the + * appropriate list and try the next zbud page on the LRU up to + * a user defined number of retries. + * + * If the handle is successfully evicted, the eviction handler should + * return 0 _and_ should have called zbud_free() on the handle. zbud_free() + * contains logic to delay freeing the page if the page is under reclaim, + * as indicated by the setting of the PG_reclaim flag on the underlying page. + * + * If all buddies in the zbud page are successfully evicted, then the + * zbud page can be freed. + * + * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are + * no pages to evict or an eviction handler is not registered, -EAGAIN if + * the retry limit was hit. + */ +int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries) +{ + int i, ret, freechunks; + struct zbud_header *zhdr; + unsigned long first_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); + if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) || + retries == 0) { + spin_unlock(&pool->lock); + return -EINVAL; + } + for (i = 0; i < retries; i++) { + zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru); + list_del(&zhdr->lru); + list_del(&zhdr->buddy); + /* Protect zbud page against free */ + zhdr->under_reclaim = true; + /* + * We need encode the handles before unlocking, since we can + * race with free that will set (first|last)_chunks to 0 + */ + first_handle = 0; + last_handle = 0; + if (zhdr->first_chunks) + first_handle = encode_handle(zhdr, FIRST); + if (zhdr->last_chunks) + last_handle = encode_handle(zhdr, LAST); + spin_unlock(&pool->lock); + + /* Issue the eviction callback(s) */ + if (first_handle) { + ret = pool->ops->evict(pool, first_handle); + if (ret) + goto next; + } + if (last_handle) { + ret = pool->ops->evict(pool, last_handle); + if (ret) + goto next; + } +next: + spin_lock(&pool->lock); + zhdr->under_reclaim = false; + if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) { + /* + * Both buddies are now free, free the zbud page and + * return success. + */ + free_zbud_page(zhdr); + pool->pages_nr--; + spin_unlock(&pool->lock); + return 0; + } else if (zhdr->first_chunks == 0 || + zhdr->last_chunks == 0) { + /* add to unbuddied list */ + freechunks = num_free_chunks(zhdr); + list_add(&zhdr->buddy, &pool->unbuddied[freechunks]); + } else { + /* add to buddied list */ + list_add(&zhdr->buddy, &pool->buddied); + } + + /* add to beginning of LRU */ + list_add(&zhdr->lru, &pool->lru); + } + spin_unlock(&pool->lock); + return -EAGAIN; +} + +/** + * zbud_map() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be mapped + * + * While trivial for zbud, the mapping functions for others allocators + * implementing this allocation API could have more complex information encoded + * in the handle and could create temporary mappings to make the data + * accessible to the user. + * + * Returns: a pointer to the mapped allocation + */ +void *zbud_map(struct zbud_pool *pool, unsigned long handle) +{ + return (void *)(handle); +} + +/** + * zbud_unmap() - maps the allocation associated with the given handle + * @pool: pool in which the allocation resides + * @handle: handle associated with the allocation to be unmapped + */ +void zbud_unmap(struct zbud_pool *pool, unsigned long handle) +{ +} + +/** + * zbud_get_pool_size() - gets the zbud pool size in pages + * @pool: pool whose size is being queried + * + * Returns: size in pages of the given pool. The pool lock need not be + * taken to access pages_nr. + */ +u64 zbud_get_pool_size(struct zbud_pool *pool) +{ + return pool->pages_nr; +} + +static int __init init_zbud(void) +{ + /* Make sure the zbud header will fit in one chunk */ + BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED); + pr_info("loaded\n"); + return 0; +} + +static void __exit exit_zbud(void) +{ + pr_info("unloaded\n"); +} + +module_init(init_zbud); +module_exit(exit_zbud); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages"); diff --git a/mm/zswap.c b/mm/zswap.c new file mode 100644 index 0000000..deda2b6 --- /dev/null +++ b/mm/zswap.c @@ -0,0 +1,943 @@ +/* + * zswap.c - zswap driver file + * + * zswap is a backend for frontswap that takes pages that are in the process + * of being swapped out and attempts to compress and store them in a + * RAM-based memory pool. This can result in a significant I/O reduction on + * the swap device and, in the case where decompressing from RAM is faster + * than reading from the swap device, can also improve workload performance. + * + * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. +*/ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/module.h> +#include <linux/cpu.h> +#include <linux/highmem.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/types.h> +#include <linux/atomic.h> +#include <linux/frontswap.h> +#include <linux/rbtree.h> +#include <linux/swap.h> +#include <linux/crypto.h> +#include <linux/mempool.h> +#include <linux/zbud.h> + +#include <linux/mm_types.h> +#include <linux/page-flags.h> +#include <linux/swapops.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> + +/********************************* +* statistics +**********************************/ +/* Number of memory pages used by the compressed pool */ +static u64 zswap_pool_pages; +/* The number of compressed pages currently stored in zswap */ +static atomic_t zswap_stored_pages = ATOMIC_INIT(0); + +/* + * The statistics below are not protected from concurrent access for + * performance reasons so they may not be a 100% accurate. However, + * they do provide useful information on roughly how many times a + * certain event is occurring. +*/ + +/* Pool limit was hit (see zswap_max_pool_percent) */ +static u64 zswap_pool_limit_hit; +/* Pages written back when pool limit was reached */ +static u64 zswap_written_back_pages; +/* Store failed due to a reclaim failure after pool limit was reached */ +static u64 zswap_reject_reclaim_fail; +/* Compressed page was too big for the allocator to (optimally) store */ +static u64 zswap_reject_compress_poor; +/* Store failed because underlying allocator could not get memory */ +static u64 zswap_reject_alloc_fail; +/* Store failed because the entry metadata could not be allocated (rare) */ +static u64 zswap_reject_kmemcache_fail; +/* Duplicate store was encountered (rare) */ +static u64 zswap_duplicate_entry; + +/********************************* +* tunables +**********************************/ +/* Enable/disable zswap (disabled by default, fixed at boot for now) */ +static bool zswap_enabled __read_mostly; +module_param_named(enabled, zswap_enabled, bool, 0); + +/* Compressor to be used by zswap (fixed at boot for now) */ +#define ZSWAP_COMPRESSOR_DEFAULT "lzo" +static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; +module_param_named(compressor, zswap_compressor, charp, 0); + +/* The maximum percentage of memory that the compressed pool can occupy */ +static unsigned int zswap_max_pool_percent = 20; +module_param_named(max_pool_percent, + zswap_max_pool_percent, uint, 0644); + +/********************************* +* compression functions +**********************************/ +/* per-cpu compression transforms */ +static struct crypto_comp * __percpu *zswap_comp_pcpu_tfms; + +enum comp_op { + ZSWAP_COMPOP_COMPRESS, + ZSWAP_COMPOP_DECOMPRESS +}; + +static int zswap_comp_op(enum comp_op op, const u8 *src, unsigned int slen, + u8 *dst, unsigned int *dlen) +{ + struct crypto_comp *tfm; + int ret; + + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, get_cpu()); + switch (op) { + case ZSWAP_COMPOP_COMPRESS: + ret = crypto_comp_compress(tfm, src, slen, dst, dlen); + break; + case ZSWAP_COMPOP_DECOMPRESS: + ret = crypto_comp_decompress(tfm, src, slen, dst, dlen); + break; + default: + ret = -EINVAL; + } + + put_cpu(); + return ret; +} + +static int __init zswap_comp_init(void) +{ + if (!crypto_has_comp(zswap_compressor, 0, 0)) { + pr_info("%s compressor not available\n", zswap_compressor); + /* fall back to default compressor */ + zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT; + if (!crypto_has_comp(zswap_compressor, 0, 0)) + /* can't even load the default compressor */ + return -ENODEV; + } + pr_info("using %s compressor\n", zswap_compressor); + + /* alloc percpu transforms */ + zswap_comp_pcpu_tfms = alloc_percpu(struct crypto_comp *); + if (!zswap_comp_pcpu_tfms) + return -ENOMEM; + return 0; +} + +static void zswap_comp_exit(void) +{ + /* free percpu transforms */ + if (zswap_comp_pcpu_tfms) + free_percpu(zswap_comp_pcpu_tfms); +} + +/********************************* +* data structures +**********************************/ +/* + * struct zswap_entry + * + * This structure contains the metadata for tracking a single compressed + * page within zswap. + * + * rbnode - links the entry into red-black tree for the appropriate swap type + * refcount - the number of outstanding reference to the entry. This is needed + * to protect against premature freeing of the entry by code + * concurent calls to load, invalidate, and writeback. The lock + * for the zswap_tree structure that contains the entry must + * be held while changing the refcount. Since the lock must + * be held, there is no reason to also make refcount atomic. + * offset - the swap offset for the entry. Index into the red-black tree. + * handle - zsmalloc allocation handle that stores the compressed page data + * length - the length in bytes of the compressed page data. Needed during + * decompression + */ +struct zswap_entry { + struct rb_node rbnode; + pgoff_t offset; + int refcount; + unsigned int length; + unsigned long handle; +}; + +struct zswap_header { + swp_entry_t swpentry; +}; + +/* + * The tree lock in the zswap_tree struct protects a few things: + * - the rbtree + * - the refcount field of each entry in the tree + */ +struct zswap_tree { + struct rb_root rbroot; + spinlock_t lock; + struct zbud_pool *pool; +}; + +static struct zswap_tree *zswap_trees[MAX_SWAPFILES]; + +/********************************* +* zswap entry functions +**********************************/ +static struct kmem_cache *zswap_entry_cache; + +static int zswap_entry_cache_create(void) +{ + zswap_entry_cache = KMEM_CACHE(zswap_entry, 0); + return (zswap_entry_cache == NULL); +} + +static void zswap_entry_cache_destory(void) +{ + kmem_cache_destroy(zswap_entry_cache); +} + +static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp) +{ + struct zswap_entry *entry; + entry = kmem_cache_alloc(zswap_entry_cache, gfp); + if (!entry) + return NULL; + entry->refcount = 1; + return entry; +} + +static void zswap_entry_cache_free(struct zswap_entry *entry) +{ + kmem_cache_free(zswap_entry_cache, entry); +} + +/* caller must hold the tree lock */ +static void zswap_entry_get(struct zswap_entry *entry) +{ + entry->refcount++; +} + +/* caller must hold the tree lock */ +static int zswap_entry_put(struct zswap_entry *entry) +{ + entry->refcount--; + return entry->refcount; +} + +/********************************* +* rbtree functions +**********************************/ +static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset) +{ + struct rb_node *node = root->rb_node; + struct zswap_entry *entry; + + while (node) { + entry = rb_entry(node, struct zswap_entry, rbnode); + if (entry->offset > offset) + node = node->rb_left; + else if (entry->offset < offset) + node = node->rb_right; + else + return entry; + } + return NULL; +} + +/* + * In the case that a entry with the same offset is found, a pointer to + * the existing entry is stored in dupentry and the function returns -EEXIST + */ +static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry, + struct zswap_entry **dupentry) +{ + struct rb_node **link = &root->rb_node, *parent = NULL; + struct zswap_entry *myentry; + + while (*link) { + parent = *link; + myentry = rb_entry(parent, struct zswap_entry, rbnode); + if (myentry->offset > entry->offset) + link = &(*link)->rb_left; + else if (myentry->offset < entry->offset) + link = &(*link)->rb_right; + else { + *dupentry = myentry; + return -EEXIST; + } + } + rb_link_node(&entry->rbnode, parent, link); + rb_insert_color(&entry->rbnode, root); + return 0; +} + +/********************************* +* per-cpu code +**********************************/ +static DEFINE_PER_CPU(u8 *, zswap_dstmem); + +static int __zswap_cpu_notifier(unsigned long action, unsigned long cpu) +{ + struct crypto_comp *tfm; + u8 *dst; + + switch (action) { + case CPU_UP_PREPARE: + tfm = crypto_alloc_comp(zswap_compressor, 0, 0); + if (IS_ERR(tfm)) { + pr_err("can't allocate compressor transform\n"); + return NOTIFY_BAD; + } + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = tfm; + dst = kmalloc(PAGE_SIZE * 2, GFP_KERNEL); + if (!dst) { + pr_err("can't allocate compressor buffer\n"); + crypto_free_comp(tfm); + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; + return NOTIFY_BAD; + } + per_cpu(zswap_dstmem, cpu) = dst; + break; + case CPU_DEAD: + case CPU_UP_CANCELED: + tfm = *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu); + if (tfm) { + crypto_free_comp(tfm); + *per_cpu_ptr(zswap_comp_pcpu_tfms, cpu) = NULL; + } + dst = per_cpu(zswap_dstmem, cpu); + kfree(dst); + per_cpu(zswap_dstmem, cpu) = NULL; + break; + default: + break; + } + return NOTIFY_OK; +} + +static int zswap_cpu_notifier(struct notifier_block *nb, + unsigned long action, void *pcpu) +{ + unsigned long cpu = (unsigned long)pcpu; + return __zswap_cpu_notifier(action, cpu); +} + +static struct notifier_block zswap_cpu_notifier_block = { + .notifier_call = zswap_cpu_notifier +}; + +static int zswap_cpu_init(void) +{ + unsigned long cpu; + + get_online_cpus(); + for_each_online_cpu(cpu) + if (__zswap_cpu_notifier(CPU_UP_PREPARE, cpu) != NOTIFY_OK) + goto cleanup; + register_cpu_notifier(&zswap_cpu_notifier_block); + put_online_cpus(); + return 0; + +cleanup: + for_each_online_cpu(cpu) + __zswap_cpu_notifier(CPU_UP_CANCELED, cpu); + put_online_cpus(); + return -ENOMEM; +} + +/********************************* +* helpers +**********************************/ +static bool zswap_is_full(void) +{ + return (totalram_pages * zswap_max_pool_percent / 100 < + zswap_pool_pages); +} + +/* + * Carries out the common pattern of freeing and entry's zsmalloc allocation, + * freeing the entry itself, and decrementing the number of stored pages. + */ +static void zswap_free_entry(struct zswap_tree *tree, struct zswap_entry *entry) +{ + zbud_free(tree->pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + zswap_pool_pages = zbud_get_pool_size(tree->pool); +} + +/********************************* +* writeback code +**********************************/ +/* return enum for zswap_get_swap_cache_page */ +enum zswap_get_swap_ret { + ZSWAP_SWAPCACHE_NEW, + ZSWAP_SWAPCACHE_EXIST, + ZSWAP_SWAPCACHE_NOMEM +}; + +/* + * zswap_get_swap_cache_page + * + * This is an adaption of read_swap_cache_async() + * + * This function tries to find a page with the given swap entry + * in the swapper_space address space (the swap cache). If the page + * is found, it is returned in retpage. Otherwise, a page is allocated, + * added to the swap cache, and returned in retpage. + * + * If success, the swap cache page is returned in retpage + * Returns 0 if page was already in the swap cache, page is not locked + * Returns 1 if the new page needs to be populated, page is locked + * Returns <0 on error + */ +static int zswap_get_swap_cache_page(swp_entry_t entry, + struct page **retpage) +{ + struct page *found_page, *new_page = NULL; + struct address_space *swapper_space = &swapper_spaces[swp_type(entry)]; + int err; + + *retpage = NULL; + do { + /* + * First check the swap cache. Since this is normally + * called after lookup_swap_cache() failed, re-calling + * that would confuse statistics. + */ + found_page = find_get_page(swapper_space, entry.val); + if (found_page) + break; + + /* + * Get a new page to read into from swap. + */ + if (!new_page) { + new_page = alloc_page(GFP_KERNEL); + if (!new_page) + break; /* Out of memory */ + } + + /* + * call radix_tree_preload() while we can wait. + */ + err = radix_tree_preload(GFP_KERNEL); + if (err) + break; + + /* + * Swap entry may have been freed since our caller observed it. + */ + err = swapcache_prepare(entry); + if (err == -EEXIST) { /* seems racy */ + radix_tree_preload_end(); + continue; + } + if (err) { /* swp entry is obsolete ? */ + radix_tree_preload_end(); + break; + } + + /* May fail (-ENOMEM) if radix-tree node allocation failed. */ + __set_page_locked(new_page); + SetPageSwapBacked(new_page); + err = __add_to_swap_cache(new_page, entry); + if (likely(!err)) { + radix_tree_preload_end(); + lru_cache_add_anon(new_page); + *retpage = new_page; + return ZSWAP_SWAPCACHE_NEW; + } + radix_tree_preload_end(); + ClearPageSwapBacked(new_page); + __clear_page_locked(new_page); + /* + * add_to_swap_cache() doesn't return -EEXIST, so we can safely + * clear SWAP_HAS_CACHE flag. + */ + swapcache_free(entry, NULL); + } while (err != -ENOMEM); + + if (new_page) + page_cache_release(new_page); + if (!found_page) + return ZSWAP_SWAPCACHE_NOMEM; + *retpage = found_page; + return ZSWAP_SWAPCACHE_EXIST; +} + +/* + * Attempts to free an entry by adding a page to the swap cache, + * decompressing the entry data into the page, and issuing a + * bio write to write the page back to the swap device. + * + * This can be thought of as a "resumed writeback" of the page + * to the swap device. We are basically resuming the same swap + * writeback path that was intercepted with the frontswap_store() + * in the first place. After the page has been decompressed into + * the swap cache, the compressed version stored by zswap can be + * freed. + */ +static int zswap_writeback_entry(struct zbud_pool *pool, unsigned long handle) +{ + struct zswap_header *zhdr; + swp_entry_t swpentry; + struct zswap_tree *tree; + pgoff_t offset; + struct zswap_entry *entry; + struct page *page; + u8 *src, *dst; + unsigned int dlen; + int ret, refcount; + struct writeback_control wbc = { + .sync_mode = WB_SYNC_NONE, + }; + + /* extract swpentry from data */ + zhdr = zbud_map(pool, handle); + swpentry = zhdr->swpentry; /* here */ + zbud_unmap(pool, handle); + tree = zswap_trees[swp_type(swpentry)]; + offset = swp_offset(swpentry); + BUG_ON(pool != tree->pool); + + /* find and ref zswap entry */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (!entry) { + /* entry was invalidated */ + spin_unlock(&tree->lock); + return 0; + } + zswap_entry_get(entry); + spin_unlock(&tree->lock); + BUG_ON(offset != entry->offset); + + /* try to allocate swap cache page */ + switch (zswap_get_swap_cache_page(swpentry, &page)) { + case ZSWAP_SWAPCACHE_NOMEM: /* no memory */ + ret = -ENOMEM; + goto fail; + + case ZSWAP_SWAPCACHE_EXIST: /* page is unlocked */ + /* page is already in the swap cache, ignore for now */ + page_cache_release(page); + ret = -EEXIST; + goto fail; + + case ZSWAP_SWAPCACHE_NEW: /* page is locked */ + /* decompress */ + dlen = PAGE_SIZE; + src = (u8 *)zbud_map(tree->pool, entry->handle) + + sizeof(struct zswap_header); + dst = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, + entry->length, dst, &dlen); + kunmap_atomic(dst); + zbud_unmap(tree->pool, entry->handle); + BUG_ON(ret); + BUG_ON(dlen != PAGE_SIZE); + + /* page is up to date */ + SetPageUptodate(page); + } + + /* start writeback */ + __swap_writepage(page, &wbc, end_swap_bio_write); + page_cache_release(page); + zswap_written_back_pages++; + + spin_lock(&tree->lock); + + /* drop local reference */ + zswap_entry_put(entry); + /* drop the initial reference from entry creation */ + refcount = zswap_entry_put(entry); + + /* + * There are three possible values for refcount here: + * (1) refcount is 1, load is in progress, unlink from rbtree, + * load will free + * (2) refcount is 0, (normal case) entry is valid, + * remove from rbtree and free entry + * (3) refcount is -1, invalidate happened during writeback, + * free entry + */ + if (refcount >= 0) { + /* no invalidate yet, remove from rbtree */ + rb_erase(&entry->rbnode, &tree->rbroot); + } + spin_unlock(&tree->lock); + if (refcount <= 0) { + /* free the entry */ + zswap_free_entry(tree, entry); + return 0; + } + return -EAGAIN; + +fail: + spin_lock(&tree->lock); + zswap_entry_put(entry); + spin_unlock(&tree->lock); + return ret; +} + +/********************************* +* frontswap hooks +**********************************/ +/* attempts to compress and store an single page */ +static int zswap_frontswap_store(unsigned type, pgoff_t offset, + struct page *page) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry, *dupentry; + int ret; + unsigned int dlen = PAGE_SIZE, len; + unsigned long handle; + char *buf; + u8 *src, *dst; + struct zswap_header *zhdr; + + if (!tree) { + ret = -ENODEV; + goto reject; + } + + /* reclaim space if needed */ + if (zswap_is_full()) { + zswap_pool_limit_hit++; + if (zbud_reclaim_page(tree->pool, 8)) { + zswap_reject_reclaim_fail++; + ret = -ENOMEM; + goto reject; + } + } + + /* allocate entry */ + entry = zswap_entry_cache_alloc(GFP_KERNEL); + if (!entry) { + zswap_reject_kmemcache_fail++; + ret = -ENOMEM; + goto reject; + } + + /* compress */ + dst = get_cpu_var(zswap_dstmem); + src = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_COMPRESS, src, PAGE_SIZE, dst, &dlen); + kunmap_atomic(src); + if (ret) { + ret = -EINVAL; + goto freepage; + } + + /* store */ + len = dlen + sizeof(struct zswap_header); + ret = zbud_alloc(tree->pool, len, __GFP_NORETRY | __GFP_NOWARN, + &handle); + if (ret == -ENOSPC) { + zswap_reject_compress_poor++; + goto freepage; + } + if (ret) { + zswap_reject_alloc_fail++; + goto freepage; + } + zhdr = zbud_map(tree->pool, handle); + zhdr->swpentry = swp_entry(type, offset); + buf = (u8 *)(zhdr + 1); + memcpy(buf, dst, dlen); + zbud_unmap(tree->pool, handle); + put_cpu_var(zswap_dstmem); + + /* populate entry */ + entry->offset = offset; + entry->handle = handle; + entry->length = dlen; + + /* map */ + spin_lock(&tree->lock); + do { + ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry); + if (ret == -EEXIST) { + zswap_duplicate_entry++; + /* remove from rbtree */ + rb_erase(&dupentry->rbnode, &tree->rbroot); + if (!zswap_entry_put(dupentry)) { + /* free */ + zswap_free_entry(tree, dupentry); + } + } + } while (ret == -EEXIST); + spin_unlock(&tree->lock); + + /* update stats */ + atomic_inc(&zswap_stored_pages); + zswap_pool_pages = zbud_get_pool_size(tree->pool); + + return 0; + +freepage: + put_cpu_var(zswap_dstmem); + zswap_entry_cache_free(entry); +reject: + return ret; +} + +/* + * returns 0 if the page was successfully decompressed + * return -1 on entry not found or error +*/ +static int zswap_frontswap_load(unsigned type, pgoff_t offset, + struct page *page) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry; + u8 *src, *dst; + unsigned int dlen; + int refcount, ret; + + /* find */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (!entry) { + /* entry was written back */ + spin_unlock(&tree->lock); + return -1; + } + zswap_entry_get(entry); + spin_unlock(&tree->lock); + + /* decompress */ + dlen = PAGE_SIZE; + src = (u8 *)zbud_map(tree->pool, entry->handle) + + sizeof(struct zswap_header); + dst = kmap_atomic(page); + ret = zswap_comp_op(ZSWAP_COMPOP_DECOMPRESS, src, entry->length, + dst, &dlen); + kunmap_atomic(dst); + zbud_unmap(tree->pool, entry->handle); + BUG_ON(ret); + + spin_lock(&tree->lock); + refcount = zswap_entry_put(entry); + if (likely(refcount)) { + spin_unlock(&tree->lock); + return 0; + } + spin_unlock(&tree->lock); + + /* + * We don't have to unlink from the rbtree because + * zswap_writeback_entry() or zswap_frontswap_invalidate page() + * has already done this for us if we are the last reference. + */ + /* free */ + + zswap_free_entry(tree, entry); + + return 0; +} + +/* frees an entry in zswap */ +static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct zswap_entry *entry; + int refcount; + + /* find */ + spin_lock(&tree->lock); + entry = zswap_rb_search(&tree->rbroot, offset); + if (!entry) { + /* entry was written back */ + spin_unlock(&tree->lock); + return; + } + + /* remove from rbtree */ + rb_erase(&entry->rbnode, &tree->rbroot); + + /* drop the initial reference from entry creation */ + refcount = zswap_entry_put(entry); + + spin_unlock(&tree->lock); + + if (refcount) { + /* writeback in progress, writeback will free */ + return; + } + + /* free */ + zswap_free_entry(tree, entry); +} + +/* frees all zswap entries for the given swap type */ +static void zswap_frontswap_invalidate_area(unsigned type) +{ + struct zswap_tree *tree = zswap_trees[type]; + struct rb_node *node; + struct zswap_entry *entry; + + if (!tree) + return; + + /* walk the tree and free everything */ + spin_lock(&tree->lock); + /* + * TODO: Even though this code should not be executed because + * the try_to_unuse() in swapoff should have emptied the tree, + * it is very wasteful to rebalance the tree after every + * removal when we are freeing the whole tree. + * + * If post-order traversal code is ever added to the rbtree + * implementation, it should be used here. + */ + while ((node = rb_first(&tree->rbroot))) { + entry = rb_entry(node, struct zswap_entry, rbnode); + rb_erase(&entry->rbnode, &tree->rbroot); + zbud_free(tree->pool, entry->handle); + zswap_entry_cache_free(entry); + atomic_dec(&zswap_stored_pages); + } + tree->rbroot = RB_ROOT; + spin_unlock(&tree->lock); +} + +static struct zbud_ops zswap_zbud_ops = { + .evict = zswap_writeback_entry +}; + +static void zswap_frontswap_init(unsigned type) +{ + struct zswap_tree *tree; + + tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL); + if (!tree) + goto err; + tree->pool = zbud_create_pool(GFP_KERNEL, &zswap_zbud_ops); + if (!tree->pool) + goto freetree; + tree->rbroot = RB_ROOT; + spin_lock_init(&tree->lock); + zswap_trees[type] = tree; + return; + +freetree: + kfree(tree); +err: + pr_err("alloc failed, zswap disabled for swap type %d\n", type); +} + +static struct frontswap_ops zswap_frontswap_ops = { + .store = zswap_frontswap_store, + .load = zswap_frontswap_load, + .invalidate_page = zswap_frontswap_invalidate_page, + .invalidate_area = zswap_frontswap_invalidate_area, + .init = zswap_frontswap_init +}; + +/********************************* +* debugfs functions +**********************************/ +#ifdef CONFIG_DEBUG_FS +#include <linux/debugfs.h> + +static struct dentry *zswap_debugfs_root; + +static int __init zswap_debugfs_init(void) +{ + if (!debugfs_initialized()) + return -ENODEV; + + zswap_debugfs_root = debugfs_create_dir("zswap", NULL); + if (!zswap_debugfs_root) + return -ENOMEM; + + debugfs_create_u64("pool_limit_hit", S_IRUGO, + zswap_debugfs_root, &zswap_pool_limit_hit); + debugfs_create_u64("reject_reclaim_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_reclaim_fail); + debugfs_create_u64("reject_alloc_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_alloc_fail); + debugfs_create_u64("reject_kmemcache_fail", S_IRUGO, + zswap_debugfs_root, &zswap_reject_kmemcache_fail); + debugfs_create_u64("reject_compress_poor", S_IRUGO, + zswap_debugfs_root, &zswap_reject_compress_poor); + debugfs_create_u64("written_back_pages", S_IRUGO, + zswap_debugfs_root, &zswap_written_back_pages); + debugfs_create_u64("duplicate_entry", S_IRUGO, + zswap_debugfs_root, &zswap_duplicate_entry); + debugfs_create_u64("pool_pages", S_IRUGO, + zswap_debugfs_root, &zswap_pool_pages); + debugfs_create_atomic_t("stored_pages", S_IRUGO, + zswap_debugfs_root, &zswap_stored_pages); + + return 0; +} + +static void __exit zswap_debugfs_exit(void) +{ + debugfs_remove_recursive(zswap_debugfs_root); +} +#else +static int __init zswap_debugfs_init(void) +{ + return 0; +} + +static void __exit zswap_debugfs_exit(void) { } +#endif + +/********************************* +* module init and exit +**********************************/ +static int __init init_zswap(void) +{ + if (!zswap_enabled) + return 0; + + pr_info("loading zswap\n"); + if (zswap_entry_cache_create()) { + pr_err("entry cache creation failed\n"); + goto error; + } + if (zswap_comp_init()) { + pr_err("compressor initialization failed\n"); + goto compfail; + } + if (zswap_cpu_init()) { + pr_err("per-cpu initialization failed\n"); + goto pcpufail; + } + frontswap_register_ops(&zswap_frontswap_ops); + if (zswap_debugfs_init()) + pr_warn("debugfs initialization failed\n"); + return 0; +pcpufail: + zswap_comp_exit(); +compfail: + zswap_entry_cache_destory(); +error: + return -ENOMEM; +} +/* must be late so crypto has time to come up */ +late_initcall(init_zswap); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>"); +MODULE_DESCRIPTION("Compressed cache for swap pages"); |