diff options
author | Johannes Weiner <hannes@cmpxchg.org> | 2014-08-08 14:19:20 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2014-08-08 15:57:17 -0700 |
commit | 00501b531c4723972aa11d6d4ebcf8d6552007c8 (patch) | |
tree | b3ad4850d58f137cf87b8424412d962fb251839f /mm/huge_memory.c | |
parent | 4449a51a7c281602d3a385044ab928322a122a02 (diff) | |
download | op-kernel-dev-00501b531c4723972aa11d6d4ebcf8d6552007c8.zip op-kernel-dev-00501b531c4723972aa11d6d4ebcf8d6552007c8.tar.gz |
mm: memcontrol: rewrite charge API
These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages. This drastically simplifies the code and
reduces charging and uncharging overhead. The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.
Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
executing in the root memcg). Before:
15.36% cat [kernel.kallsyms] [k] copy_user_generic_string
13.31% cat [kernel.kallsyms] [k] memset
11.48% cat [kernel.kallsyms] [k] do_mpage_readpage
4.23% cat [kernel.kallsyms] [k] get_page_from_freelist
2.38% cat [kernel.kallsyms] [k] put_page
2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge
2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common
1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
After:
15.67% cat [kernel.kallsyms] [k] copy_user_generic_string
13.48% cat [kernel.kallsyms] [k] memset
11.42% cat [kernel.kallsyms] [k] do_mpage_readpage
3.98% cat [kernel.kallsyms] [k] get_page_from_freelist
2.46% cat [kernel.kallsyms] [k] put_page
2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list
1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup
1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn
1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk
1.30% cat [kernel.kallsyms] [k] kfree
As you can see, the memcg footprint has shrunk quite a bit.
text data bss dec hex filename
37970 9892 400 48262 bc86 mm/memcontrol.o.old
35239 9892 400 45531 b1db mm/memcontrol.o
This patch (of 4):
The memcg charge API charges pages before they are rmapped - i.e. have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on. Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.
Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:
mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
pages from the memcg if necessary.
mem_cgroup_commit_charge() commits the page to the charge once it
has a valid page->mapping and PageAnon() reliably tells the type.
mem_cgroup_cancel_charge() aborts the transaction.
This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again. Revive lru_cache_add_active_or_unevictable().
[hughd@google.com: fix shmem_unuse]
[hughd@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Acked-by: Michal Hocko <mhocko@suse.cz>
Cc: Tejun Heo <tj@kernel.org>
Cc: Vladimir Davydov <vdavydov@parallels.com>
Signed-off-by: Hugh Dickins <hughd@google.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r-- | mm/huge_memory.c | 57 |
1 files changed, 37 insertions, 20 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 3630d57..d9a21d06 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -715,13 +715,20 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, unsigned long haddr, pmd_t *pmd, struct page *page) { + struct mem_cgroup *memcg; pgtable_t pgtable; spinlock_t *ptl; VM_BUG_ON_PAGE(!PageCompound(page), page); + + if (mem_cgroup_try_charge(page, mm, GFP_TRANSHUGE, &memcg)) + return VM_FAULT_OOM; + pgtable = pte_alloc_one(mm, haddr); - if (unlikely(!pgtable)) + if (unlikely(!pgtable)) { + mem_cgroup_cancel_charge(page, memcg); return VM_FAULT_OOM; + } clear_huge_page(page, haddr, HPAGE_PMD_NR); /* @@ -734,7 +741,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, ptl = pmd_lock(mm, pmd); if (unlikely(!pmd_none(*pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(page); + mem_cgroup_cancel_charge(page, memcg); put_page(page); pte_free(mm, pgtable); } else { @@ -742,6 +749,8 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, entry = mk_huge_pmd(page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); page_add_new_anon_rmap(page, vma, haddr); + mem_cgroup_commit_charge(page, memcg, false); + lru_cache_add_active_or_unevictable(page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); add_mm_counter(mm, MM_ANONPAGES, HPAGE_PMD_NR); @@ -827,13 +836,7 @@ int do_huge_pmd_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma, count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - if (unlikely(mem_cgroup_charge_anon(page, mm, GFP_TRANSHUGE))) { - put_page(page); - count_vm_event(THP_FAULT_FALLBACK); - return VM_FAULT_FALLBACK; - } if (unlikely(__do_huge_pmd_anonymous_page(mm, vma, haddr, pmd, page))) { - mem_cgroup_uncharge_page(page); put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -979,6 +982,7 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, struct page *page, unsigned long haddr) { + struct mem_cgroup *memcg; spinlock_t *ptl; pgtable_t pgtable; pmd_t _pmd; @@ -999,20 +1003,21 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, __GFP_OTHER_NODE, vma, address, page_to_nid(page)); if (unlikely(!pages[i] || - mem_cgroup_charge_anon(pages[i], mm, - GFP_KERNEL))) { + mem_cgroup_try_charge(pages[i], mm, GFP_KERNEL, + &memcg))) { if (pages[i]) put_page(pages[i]); - mem_cgroup_uncharge_start(); while (--i >= 0) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); ret |= VM_FAULT_OOM; goto out; } + set_page_private(pages[i], (unsigned long)memcg); } for (i = 0; i < HPAGE_PMD_NR; i++) { @@ -1041,7 +1046,11 @@ static int do_huge_pmd_wp_page_fallback(struct mm_struct *mm, pte_t *pte, entry; entry = mk_pte(pages[i], vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); page_add_new_anon_rmap(pages[i], vma, haddr); + mem_cgroup_commit_charge(pages[i], memcg, false); + lru_cache_add_active_or_unevictable(pages[i], vma); pte = pte_offset_map(&_pmd, haddr); VM_BUG_ON(!pte_none(*pte)); set_pte_at(mm, haddr, pte, entry); @@ -1065,12 +1074,12 @@ out: out_free_pages: spin_unlock(ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); - mem_cgroup_uncharge_start(); for (i = 0; i < HPAGE_PMD_NR; i++) { - mem_cgroup_uncharge_page(pages[i]); + memcg = (void *)page_private(pages[i]); + set_page_private(pages[i], 0); + mem_cgroup_cancel_charge(pages[i], memcg); put_page(pages[i]); } - mem_cgroup_uncharge_end(); kfree(pages); goto out; } @@ -1081,6 +1090,7 @@ int do_huge_pmd_wp_page(struct mm_struct *mm, struct vm_area_struct *vma, spinlock_t *ptl; int ret = 0; struct page *page = NULL, *new_page; + struct mem_cgroup *memcg; unsigned long haddr; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -1132,7 +1142,8 @@ alloc: goto out; } - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) { + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) { put_page(new_page); if (page) { split_huge_page(page); @@ -1161,7 +1172,7 @@ alloc: put_user_huge_page(page); if (unlikely(!pmd_same(*pmd, orig_pmd))) { spin_unlock(ptl); - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); put_page(new_page); goto out_mn; } else { @@ -1170,6 +1181,8 @@ alloc: entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); pmdp_clear_flush(vma, haddr, pmd); page_add_new_anon_rmap(new_page, vma, haddr); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); set_pmd_at(mm, haddr, pmd, entry); update_mmu_cache_pmd(vma, address, pmd); if (!page) { @@ -2413,6 +2426,7 @@ static void collapse_huge_page(struct mm_struct *mm, spinlock_t *pmd_ptl, *pte_ptl; int isolated; unsigned long hstart, hend; + struct mem_cgroup *memcg; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ @@ -2423,7 +2437,8 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_charge_anon(new_page, mm, GFP_TRANSHUGE))) + if (unlikely(mem_cgroup_try_charge(new_page, mm, + GFP_TRANSHUGE, &memcg))) return; /* @@ -2510,6 +2525,8 @@ static void collapse_huge_page(struct mm_struct *mm, spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); page_add_new_anon_rmap(new_page, vma, address); + mem_cgroup_commit_charge(new_page, memcg, false); + lru_cache_add_active_or_unevictable(new_page, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); @@ -2523,7 +2540,7 @@ out_up_write: return; out: - mem_cgroup_uncharge_page(new_page); + mem_cgroup_cancel_charge(new_page, memcg); goto out_up_write; } |