diff options
author | Kirill A. Shutemov <kirill.shutemov@linux.intel.com> | 2015-02-11 15:26:50 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2015-02-11 17:06:04 -0800 |
commit | dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 (patch) | |
tree | 41075776145d02727c15c27d522b4c93529cca77 /fs/proc | |
parent | 8aa76875dc15b2dd21fa74eb7c12dc3c75f4b6b6 (diff) | |
download | op-kernel-dev-dc6c9a35b66b520cf67e05d8ca60ebecad3b0479.zip op-kernel-dev-dc6c9a35b66b520cf67e05d8ca60ebecad3b0479.tar.gz |
mm: account pmd page tables to the process
Dave noticed that unprivileged process can allocate significant amount of
memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and
memory cgroup. The trick is to allocate a lot of PMD page tables. Linux
kernel doesn't account PMD tables to the process, only PTE.
The use-cases below use few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low. oom_score for the process will be 0.
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <sys/mman.h>
#include <sys/prctl.h>
#define PUD_SIZE (1UL << 30)
#define PMD_SIZE (1UL << 21)
#define NR_PUD 130000
int main(void)
{
char *addr = NULL;
unsigned long i;
prctl(PR_SET_THP_DISABLE);
for (i = 0; i < NR_PUD ; i++) {
addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
if (addr == MAP_FAILED) {
perror("mmap");
break;
}
*addr = 'x';
munmap(addr, PMD_SIZE);
mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
if (addr == MAP_FAILED)
perror("re-mmap"), exit(1);
}
printf("PID %d consumed %lu KiB in PMD page tables\n",
getpid(), i * 4096 >> 10);
return pause();
}
The patch addresses the issue by account PMD tables to the process the
same way we account PTE.
The main place where PMD tables is accounted is __pmd_alloc() and
free_pmd_range(). But there're few corner cases:
- HugeTLB can share PMD page tables. The patch handles by accounting
the table to all processes who share it.
- x86 PAE pre-allocates few PMD tables on fork.
- Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity
check on exit(2).
Accounting only happens on configuration where PMD page table's level is
present (PMD is not folded). As with nr_ptes we use per-mm counter. The
counter value is used to calculate baseline for badness score by
oom-killer.
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Reported-by: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Hugh Dickins <hughd@google.com>
Reviewed-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: David Rientjes <rientjes@google.com>
Tested-by: Sedat Dilek <sedat.dilek@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'fs/proc')
-rw-r--r-- | fs/proc/task_mmu.c | 9 |
1 files changed, 6 insertions, 3 deletions
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 6396f88..e6e0abe 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -21,7 +21,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) { - unsigned long data, text, lib, swap; + unsigned long data, text, lib, swap, ptes, pmds; unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss; /* @@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; swap = get_mm_counter(mm, MM_SWAPENTS); + ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes); + pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm); seq_printf(m, "VmPeak:\t%8lu kB\n" "VmSize:\t%8lu kB\n" @@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) "VmExe:\t%8lu kB\n" "VmLib:\t%8lu kB\n" "VmPTE:\t%8lu kB\n" + "VmPMD:\t%8lu kB\n" "VmSwap:\t%8lu kB\n", hiwater_vm << (PAGE_SHIFT-10), total_vm << (PAGE_SHIFT-10), @@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) total_rss << (PAGE_SHIFT-10), data << (PAGE_SHIFT-10), mm->stack_vm << (PAGE_SHIFT-10), text, lib, - (PTRS_PER_PTE * sizeof(pte_t) * - atomic_long_read(&mm->nr_ptes)) >> 10, + ptes >> 10, + pmds >> 10, swap << (PAGE_SHIFT-10)); } |