From 95156f0051cba60ec674bbaa5cf7dc74a74c5612 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 12 Jan 2009 13:02:11 +0100 Subject: lockdep, mm: fix might_fault() annotation Some code (nfs/sunrpc) uses socket ops on kernel memory while holding the mmap_sem, this is safe because kernel memory doesn't get paged out, therefore we'll never actually fault, and the might_fault() annotations will generate false positives. Reported-by: "J. Bruce Fields" Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- mm/memory.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index e009ce8..c2d4c47 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3165,6 +3165,15 @@ void print_vma_addr(char *prefix, unsigned long ip) #ifdef CONFIG_PROVE_LOCKING void might_fault(void) { + /* + * Some code (nfs/sunrpc) uses socket ops on kernel memory while + * holding the mmap_sem, this is safe because kernel memory doesn't + * get paged out, therefore we'll never actually fault, and the + * below annotations will generate false positives. + */ + if (segment_eq(get_fs(), KERNEL_DS)) + return; + might_sleep(); /* * it would be nicer only to annotate paths which are not under -- cgit v1.1 From a36706131182f5507d1e2cfbf391b0fa8d72203c Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:09 -0800 Subject: x86 PAT: remove PFNMAP type on track_pfn_vma_new() error Impact: fix (harmless) double-free of memtype entries and avoid warning On track_pfn_vma_new() failure, reset the vm_flags so that there will be no second cleanup happening when upper level routines call unmap_vmas(). This patch fixes part of the bug reported here: http://marc.info/?l=linux-kernel&m=123108883716357&w=2 Specifically the error message: X:5010 freeing invalid memtype d0000000-d0101000 Is due to multiple frees on error path, will not happen with the patch below. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- mm/memory.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index c2d4c47..d3ee2ea 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1672,8 +1672,14 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); - if (err) + if (err) { + /* + * To indicate that track_pfn related cleanup is not + * needed from higher level routine calling unmap_vmas + */ + vma->vm_flags &= ~(VM_IO | VM_RESERVED | VM_PFNMAP); return -EINVAL; + } BUG_ON(addr >= end); pfn -= addr >> PAGE_SHIFT; -- cgit v1.1 From e4b866ed197cef9989348e0479fed8d864ea465b Mon Sep 17 00:00:00 2001 From: "venkatesh.pallipadi@intel.com" Date: Fri, 9 Jan 2009 16:13:11 -0800 Subject: x86 PAT: change track_pfn_vma_new to take pgprot_t pointer param Impact: cleanup Change the protection parameter for track_pfn_vma_new() into a pgprot_t pointer. Subsequent patch changes the x86 PAT handling to return a compatible memtype in pgprot_t, if what was requested cannot be allowed due to conflicts. No fuctionality change in this patch. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Suresh Siddha Signed-off-by: Ingo Molnar --- mm/memory.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/memory.c b/mm/memory.c index d3ee2ea..22bfa7a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1511,6 +1511,7 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn) { int ret; + pgprot_t pgprot = vma->vm_page_prot; /* * Technically, architectures with pte_special can avoid all these * restrictions (same for remap_pfn_range). However we would like @@ -1525,10 +1526,10 @@ int vm_insert_pfn(struct vm_area_struct *vma, unsigned long addr, if (addr < vma->vm_start || addr >= vma->vm_end) return -EFAULT; - if (track_pfn_vma_new(vma, vma->vm_page_prot, pfn, PAGE_SIZE)) + if (track_pfn_vma_new(vma, &pgprot, pfn, PAGE_SIZE)) return -EINVAL; - ret = insert_pfn(vma, addr, pfn, vma->vm_page_prot); + ret = insert_pfn(vma, addr, pfn, pgprot); if (ret) untrack_pfn_vma(vma, pfn, PAGE_SIZE); @@ -1671,7 +1672,7 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr, vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP; - err = track_pfn_vma_new(vma, prot, pfn, PAGE_ALIGN(size)); + err = track_pfn_vma_new(vma, &prot, pfn, PAGE_ALIGN(size)); if (err) { /* * To indicate that track_pfn related cleanup is not -- cgit v1.1 From 2ed7c03ec17779afb4fcfa3b8c61df61bd4879ba Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:13:54 +0100 Subject: [CVE-2009-0029] Convert all system calls to return a long Convert all system calls to return a long. This should be a NOP since all converted types should have the same size anyway. With the exception of sys_exit_group which returned void. But that doesn't matter since the system call doesn't return. Signed-off-by: Heiko Carstens --- mm/filemap.c | 2 +- mm/mmap.c | 2 +- mm/mremap.c | 2 +- mm/nommu.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/filemap.c b/mm/filemap.c index ceba0bd..538b75e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage ssize_t sys_readahead(int fd, loff_t offset, size_t count) +asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; diff --git a/mm/mmap.c b/mm/mmap.c index 74962319..a970d89 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; diff --git a/mm/mremap.c b/mm/mremap.c index 646de95..5572e08 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,7 +420,7 @@ out_nc: return ret; } -asmlinkage unsigned long sys_mremap(unsigned long addr, +asmlinkage long sys_mremap(unsigned long addr, unsigned long old_len, unsigned long new_len, unsigned long flags, unsigned long new_addr) { diff --git a/mm/nommu.c b/mm/nommu.c index 60ed837..ee3e789 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage unsigned long sys_brk(unsigned long brk) +asmlinkage long sys_brk(unsigned long brk) { struct mm_struct *mm = current->mm; -- cgit v1.1 From 6673e0c3fbeaed2cd08e2fd4a4aa97382d6fedb0 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:02 +0100 Subject: [CVE-2009-0029] System call wrapper special cases System calls with an unsigned long long argument can't be converted with the standard wrappers since that would include a cast to long, which in turn means that we would lose the upper 32 bit on 32 bit architectures. Also semctl can't use the standard wrapper since it has a 'union' parameter. So we handle them as special case and add some extra wrappers instead. Signed-off-by: Heiko Carstens --- mm/fadvise.c | 18 ++++++++++++++++-- mm/filemap.c | 9 ++++++++- 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'mm') diff --git a/mm/fadvise.c b/mm/fadvise.c index a1da969..54a0f80 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -24,7 +24,7 @@ * POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could * deactivate the pages and clear PG_Referenced. */ -asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice) +SYSCALL_DEFINE(fadvise64_64)(int fd, loff_t offset, loff_t len, int advice) { struct file *file = fget(fd); struct address_space *mapping; @@ -126,12 +126,26 @@ out: fput(file); return ret; } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fadvise64_64(long fd, loff_t offset, loff_t len, long advice) +{ + return SYSC_fadvise64_64((int) fd, offset, len, (int) advice); +} +SYSCALL_ALIAS(sys_fadvise64_64, SyS_fadvise64_64); +#endif #ifdef __ARCH_WANT_SYS_FADVISE64 -asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice) +SYSCALL_DEFINE(fadvise64)(int fd, loff_t offset, size_t len, int advice) { return sys_fadvise64_64(fd, offset, len, advice); } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_fadvise64(long fd, loff_t offset, long len, long advice) +{ + return SYSC_fadvise64((int) fd, offset, (size_t)len, (int)advice); +} +SYSCALL_ALIAS(sys_fadvise64, SyS_fadvise64); +#endif #endif diff --git a/mm/filemap.c b/mm/filemap.c index 538b75e..23acefe 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1374,7 +1374,7 @@ do_readahead(struct address_space *mapping, struct file *filp, return 0; } -asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) +SYSCALL_DEFINE(readahead)(int fd, loff_t offset, size_t count) { ssize_t ret; struct file *file; @@ -1393,6 +1393,13 @@ asmlinkage long sys_readahead(int fd, loff_t offset, size_t count) } return ret; } +#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS +asmlinkage long SyS_readahead(long fd, loff_t offset, long count) +{ + return SYSC_readahead((int) fd, offset, (size_t) count); +} +SYSCALL_ALIAS(sys_readahead, SyS_readahead); +#endif #ifdef CONFIG_MMU /** -- cgit v1.1 From 6a6160a7b5c27b3c38651baef92a14fa7072b3c1 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:15 +0100 Subject: [CVE-2009-0029] System call wrappers part 13 Signed-off-by: Heiko Carstens --- mm/fremap.c | 4 ++-- mm/mlock.c | 4 ++-- mm/mmap.c | 4 ++-- mm/mprotect.c | 4 ++-- mm/mremap.c | 6 +++--- mm/msync.c | 2 +- mm/nommu.c | 11 +++++------ 7 files changed, 17 insertions(+), 18 deletions(-) (limited to 'mm') diff --git a/mm/fremap.c b/mm/fremap.c index 62d5bbd..736ba7f 100644 --- a/mm/fremap.c +++ b/mm/fremap.c @@ -120,8 +120,8 @@ static int populate_range(struct mm_struct *mm, struct vm_area_struct *vma, * and the vma's default protection is used. Arbitrary protections * might be implemented in the future. */ -asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, - unsigned long prot, unsigned long pgoff, unsigned long flags) +SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, + unsigned long, prot, unsigned long, pgoff, unsigned long, flags) { struct mm_struct *mm = current->mm; struct address_space *mapping; diff --git a/mm/mlock.c b/mm/mlock.c index e125156..04d5e74 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -530,7 +530,7 @@ static int do_mlock(unsigned long start, size_t len, int on) return error; } -asmlinkage long sys_mlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(mlock, unsigned long, start, size_t, len) { unsigned long locked; unsigned long lock_limit; @@ -558,7 +558,7 @@ asmlinkage long sys_mlock(unsigned long start, size_t len) return error; } -asmlinkage long sys_munlock(unsigned long start, size_t len) +SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) { int ret; diff --git a/mm/mmap.c b/mm/mmap.c index a970d89..8d95902 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -245,7 +245,7 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -asmlinkage long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long rlim, retval; unsigned long newbrk, oldbrk; @@ -1948,7 +1948,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len) EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; diff --git a/mm/mprotect.c b/mm/mprotect.c index d0f6e7c..abe2694 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -217,8 +217,8 @@ fail: return error; } -asmlinkage long -sys_mprotect(unsigned long start, size_t len, unsigned long prot) +SYSCALL_DEFINE3(mprotect, unsigned long, start, size_t, len, + unsigned long, prot) { unsigned long vm_flags, nstart, end, tmp, reqprot; struct vm_area_struct *vma, *prev; diff --git a/mm/mremap.c b/mm/mremap.c index 5572e08..a39b7b9 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -420,9 +420,9 @@ out_nc: return ret; } -asmlinkage long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; diff --git a/mm/msync.c b/mm/msync.c index 07dae08..4083209 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -28,7 +28,7 @@ * So by _not_ starting I/O in MS_ASYNC we provide complete flexibility to * applications. */ -asmlinkage long sys_msync(unsigned long start, size_t len, int flags) +SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) { unsigned long end; struct mm_struct *mm = current->mm; diff --git a/mm/nommu.c b/mm/nommu.c index ee3e789..8cee8c8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -416,7 +416,7 @@ EXPORT_SYMBOL(vm_insert_page); * to a regular file. in this case, the unmapping will need * to invoke file system routines that need the global lock. */ -asmlinkage long sys_brk(unsigned long brk) +SYSCALL_DEFINE1(brk, unsigned long, brk) { struct mm_struct *mm = current->mm; @@ -1573,7 +1573,7 @@ erase_whole_vma: } EXPORT_SYMBOL(do_munmap); -asmlinkage long sys_munmap(unsigned long addr, size_t len) +SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) { int ret; struct mm_struct *mm = current->mm; @@ -1657,10 +1657,9 @@ unsigned long do_mremap(unsigned long addr, } EXPORT_SYMBOL(do_mremap); -asmlinkage -unsigned long sys_mremap(unsigned long addr, - unsigned long old_len, unsigned long new_len, - unsigned long flags, unsigned long new_addr) +SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, + unsigned long, new_len, unsigned long, flags, + unsigned long, new_addr) { unsigned long ret; -- cgit v1.1 From 3480b25743cb7404928d57efeaa3d085708b04c2 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:16 +0100 Subject: [CVE-2009-0029] System call wrappers part 14 Signed-off-by: Heiko Carstens --- mm/madvise.c | 2 +- mm/mincore.c | 4 ++-- mm/mlock.c | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/madvise.c b/mm/madvise.c index f9349c1..b9ce574 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -281,7 +281,7 @@ madvise_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, * -EBADF - map exists, but area maps something that isn't a file. * -EAGAIN - a kernel resource was temporarily unavailable. */ -asmlinkage long sys_madvise(unsigned long start, size_t len_in, int behavior) +SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { unsigned long end, tmp; struct vm_area_struct * vma, *prev; diff --git a/mm/mincore.c b/mm/mincore.c index 5178800..8cb508f 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -177,8 +177,8 @@ none_mapped: * mapped * -EAGAIN - A kernel resource was temporarily unavailable. */ -asmlinkage long sys_mincore(unsigned long start, size_t len, - unsigned char __user * vec) +SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len, + unsigned char __user *, vec) { long retval; unsigned long pages; diff --git a/mm/mlock.c b/mm/mlock.c index 04d5e74..2904a34 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -595,7 +595,7 @@ out: return 0; } -asmlinkage long sys_mlockall(int flags) +SYSCALL_DEFINE1(mlockall, int, flags) { unsigned long lock_limit; int ret = -EINVAL; @@ -623,7 +623,7 @@ out: return ret; } -asmlinkage long sys_munlockall(void) +SYSCALL_DEFINE0(munlockall) { int ret; -- cgit v1.1 From c4ea37c26a691ad0b7e86aa5884aab27830e95c9 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:28 +0100 Subject: [CVE-2009-0029] System call wrappers part 26 Signed-off-by: Heiko Carstens --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index da422c4..f48b831 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1377,7 +1377,7 @@ out: return ret; } -asmlinkage long sys_swapoff(const char __user * specialfile) +SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) { struct swap_info_struct * p = NULL; unsigned short *swap_map; @@ -1633,7 +1633,7 @@ late_initcall(max_swapfiles_check); * * The swapon system call */ -asmlinkage long sys_swapon(const char __user * specialfile, int swap_flags) +SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) { struct swap_info_struct * p; char *name = NULL; -- cgit v1.1 From 938bb9f5e840eddbf54e4f62f6c5ba9b3ae12c9d Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 14 Jan 2009 14:14:30 +0100 Subject: [CVE-2009-0029] System call wrappers part 28 Signed-off-by: Heiko Carstens --- mm/mempolicy.c | 24 +++++++++++------------- mm/migrate.c | 8 ++++---- 2 files changed, 15 insertions(+), 17 deletions(-) (limited to 'mm') diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e412ffa..3eb4a6f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1068,10 +1068,9 @@ static int copy_nodes_to_user(unsigned long __user *mask, unsigned long maxnode, return copy_to_user(mask, nodes_addr(*nodes), copy) ? -EFAULT : 0; } -asmlinkage long sys_mbind(unsigned long start, unsigned long len, - unsigned long mode, - unsigned long __user *nmask, unsigned long maxnode, - unsigned flags) +SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, + unsigned long, mode, unsigned long __user *, nmask, + unsigned long, maxnode, unsigned, flags) { nodemask_t nodes; int err; @@ -1091,8 +1090,8 @@ asmlinkage long sys_mbind(unsigned long start, unsigned long len, } /* Set the process memory policy */ -asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, - unsigned long maxnode) +SYSCALL_DEFINE3(set_mempolicy, int, mode, unsigned long __user *, nmask, + unsigned long, maxnode) { int err; nodemask_t nodes; @@ -1110,9 +1109,9 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask, return do_set_mempolicy(mode, flags, &nodes); } -asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode, - const unsigned long __user *old_nodes, - const unsigned long __user *new_nodes) +SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, + const unsigned long __user *, old_nodes, + const unsigned long __user *, new_nodes) { const struct cred *cred = current_cred(), *tcred; struct mm_struct *mm; @@ -1185,10 +1184,9 @@ out: /* Retrieve NUMA policy */ -asmlinkage long sys_get_mempolicy(int __user *policy, - unsigned long __user *nmask, - unsigned long maxnode, - unsigned long addr, unsigned long flags) +SYSCALL_DEFINE5(get_mempolicy, int __user *, policy, + unsigned long __user *, nmask, unsigned long, maxnode, + unsigned long, addr, unsigned long, flags) { int err; int uninitialized_var(pval); diff --git a/mm/migrate.c b/mm/migrate.c index a30ea5f..2bb4e1d 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1055,10 +1055,10 @@ out: * Move a list of pages in the address space of the currently executing * process. */ -asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages, - const void __user * __user *pages, - const int __user *nodes, - int __user *status, int flags) +SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, + const void __user * __user *, pages, + const int __user *, nodes, + int __user *, status, int, flags) { const struct cred *cred = current_cred(), *tcred; struct task_struct *task; -- cgit v1.1 From 822c18f2e38cbc775792ab65ace4f9198678dec9 Mon Sep 17 00:00:00 2001 From: Ivan Kokshaysky Date: Thu, 15 Jan 2009 13:50:48 -0800 Subject: alpha: fix vmalloc breakage On alpha, we have to map some stuff in the VMALLOC space very early in the boot process (to make SRM console callbacks work and so on, see arch/alpha/mm/init.c). For old VM allocator, we just manually placed a vm_struct onto the global vmlist and this worked for ages. Unfortunately, the new allocator isn't aware of this, so it constantly tries to allocate the VM space which is already in use, making vmalloc on alpha defunct. This patch forces KVA to import vmlist entries on init. [akpm@linux-foundation.org: remove unneeded check (per Johannes)] Signed-off-by: Ivan Kokshaysky Cc: Nick Piggin Cc: Johannes Weiner Cc: Richard Henderson Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index c5db9a7..7e00b28 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -984,6 +985,8 @@ EXPORT_SYMBOL(vm_map_ram); void __init vmalloc_init(void) { + struct vmap_area *va; + struct vm_struct *tmp; int i; for_each_possible_cpu(i) { @@ -996,6 +999,14 @@ void __init vmalloc_init(void) vbq->nr_dirty = 0; } + /* Import existing vmlist entries. */ + for (tmp = vmlist; tmp; tmp = tmp->next) { + va = alloc_bootmem(sizeof(struct vmap_area)); + va->flags = tmp->flags | VM_VM_AREA; + va->va_start = (unsigned long)tmp->addr; + va->va_end = va->va_start + tmp->size; + __insert_vmap_area(va); + } vmap_initialized = true; } -- cgit v1.1 From bd112db872c2f69993c86f458467acb4a14da010 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 15 Jan 2009 13:51:11 -0800 Subject: memcg: fix mem_cgroup_get_reclaim_stat_from_page In case of swapin, a new page is added to lru before it is charged, so page->pc->mem_cgroup points to NULL or last mem_cgroup the page was charged before. In the latter case, if the mem_cgroup has already freed by rmdir, the area pointed to by page->pc->mem_cgroup may have invalid data. Actually, I saw general protection fault. general protection fault: 0000 [#1] SMP last sysfs file: /sys/devices/system/cpu/cpu15/cache/index1/shared_cpu_map CPU 4 Modules linked in: ipt_REJECT xt_tcpudp iptable_filter ip_tables x_tables bridge stp ipv6 autofs4 hidp rfcomm l2cap bluetooth sunrpc dm_mirror dm_region_hash dm_log dm_multipath dm_mod rfkill input_polldev sbs sbshc battery ac lp sg ide_cd_mod cdrom button serio_raw acpi_memhotplug parport_pc e1000 rtc_cmos parport rtc_core rtc_lib i2c_i801 i2c_core shpchp pcspkr ata_piix libata megaraid_mbox megaraid_mm sd_mod scsi_mod ext3 jbd ehci_hcd ohci_hcd uhci_hcd [last unloaded: microcode] Pid: 26038, comm: page01 Tainted: G W 2.6.28-rc9-mm1-mmotm-2008-12-22-16-14-f2ab3dea #1 RIP: 0010:[] [] update_page_reclaim_stat+0x2f/0x42 RSP: 0000:ffff8801ee457da8 EFLAGS: 00010002 RAX: 32353438312021c8 RBX: 0000000000000000 RCX: 32353438312021c8 RDX: 0000000000000000 RSI: ffff8800cb0b1000 RDI: ffff8801164d1d28 RBP: ffff880110002cb8 R08: ffff88010f2eae23 R09: 0000000000000001 R10: ffff8800bc514b00 R11: ffff880110002c00 R12: 0000000000000000 R13: ffff88000f484100 R14: 0000000000000003 R15: 00000000001200d2 FS: 00007f8a261726f0(0000) GS:ffff88010f2eaa80(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 00007f8a25d22000 CR3: 00000001ef18c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process page01 (pid: 26038, threadinfo ffff8801ee456000, task ffff8800b585b960) Stack: ffffe200071ee568 ffff880110001f00 0000000000000000 ffffffff8028ea17 ffff88000f484100 0000000000000000 0000000000000020 00007f8a25d22000 ffff8800bc514b00 ffffffff8028ec34 0000000000000000 0000000000016fd8 Call Trace: [] ? ____pagevec_lru_add+0xc1/0x13c [] ? drain_cpu_pagevecs+0x36/0x89 [] ? swapin_readahead+0x78/0x98 [] ? handle_mm_fault+0x3d9/0x741 [] ? do_page_fault+0x3ce/0x78c [] ? trace_hardirqs_off_thunk+0x3a/0x3c [] ? page_fault+0x1f/0x30 Code: cc 55 48 8d af b8 0d 00 00 48 89 f7 53 89 d3 e8 39 85 02 00 48 63 d3 48 ff 44 d5 10 45 85 e4 74 05 48 ff 44 d5 00 48 85 c0 74 0e <48> ff 44 d0 10 45 85 e4 74 04 48 ff 04 d0 5b 5d 41 5c c3 41 54 RIP [] update_page_reclaim_stat+0x2f/0x42 RSP Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e2996b8..b665127 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -358,6 +358,10 @@ void mem_cgroup_rotate_lru_list(struct page *page, enum lru_list lru) return; pc = lookup_page_cgroup(page); + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ smp_rmb(); /* unused page is not rotated. */ if (!PageCgroupUsed(pc)) @@ -374,7 +378,10 @@ void mem_cgroup_add_lru_list(struct page *page, enum lru_list lru) if (mem_cgroup_disabled()) return; pc = lookup_page_cgroup(page); - /* barrier to sync with "charge" */ + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ smp_rmb(); if (!PageCgroupUsed(pc)) return; @@ -559,6 +566,14 @@ mem_cgroup_get_reclaim_stat_from_page(struct page *page) return NULL; pc = lookup_page_cgroup(page); + /* + * Used bit is set without atomic ops but after smp_wmb(). + * For making pc->mem_cgroup visible, insert smp_rmb() here. + */ + smp_rmb(); + if (!PageCgroupUsed(pc)) + return NULL; + mz = page_cgroup_zoneinfo(pc); if (!mz) return NULL; -- cgit v1.1 From 40d58138f832a48208cdce57d6572a033b1f7a23 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 15 Jan 2009 13:51:12 -0800 Subject: memcg: fix error path of mem_cgroup_move_parent There is a bug in error path of mem_cgroup_move_parent. Extra refcnt got from try_charge should be dropped, and usages incremented by try_charge should be decremented in both error paths: A: failure at get_page_unless_zero B: failure at isolate_lru_page This bug makes this parent directory unremovable. In case of A, rmdir doesn't return, because res.usage doesn't go down to 0 at mem_cgroup_force_empty even after all the pc in lru are removed. In case of B, rmdir fails and returns -EBUSY, because it has extra ref counts even after res.usage goes down to 0. Signed-off-by: Daisuke Nishimura Acked-by: KAMEZAWA Hiroyuki Acked-by: Balbir Singh Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b665127..7be9b35 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -994,14 +994,15 @@ static int mem_cgroup_move_account(struct page_cgroup *pc, if (pc->mem_cgroup != from) goto out; - css_put(&from->css); res_counter_uncharge(&from->res, PAGE_SIZE); mem_cgroup_charge_statistics(from, pc, false); if (do_swap_account) res_counter_uncharge(&from->memsw, PAGE_SIZE); + css_put(&from->css); + + css_get(&to->css); pc->mem_cgroup = to; mem_cgroup_charge_statistics(to, pc, true); - css_get(&to->css); ret = 0; out: unlock_page_cgroup(pc); @@ -1034,8 +1035,10 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, if (ret || !parent) return ret; - if (!get_page_unless_zero(page)) - return -EBUSY; + if (!get_page_unless_zero(page)) { + ret = -EBUSY; + goto uncharge; + } ret = isolate_lru_page(page); @@ -1044,19 +1047,23 @@ static int mem_cgroup_move_parent(struct page_cgroup *pc, ret = mem_cgroup_move_account(pc, child, parent); - /* drop extra refcnt by try_charge() (move_account increment one) */ - css_put(&parent->css); putback_lru_page(page); if (!ret) { put_page(page); + /* drop extra refcnt by try_charge() */ + css_put(&parent->css); return 0; } - /* uncharge if move fails */ + cancel: + put_page(page); +uncharge: + /* drop extra refcnt by try_charge() */ + css_put(&parent->css); + /* uncharge if move fails */ res_counter_uncharge(&parent->res, PAGE_SIZE); if (do_swap_account) res_counter_uncharge(&parent->memsw, PAGE_SIZE); - put_page(page); return ret; } -- cgit v1.1 From c268e9946d7dc30ac4e55cdc3f43c8af1ae8153c Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 15 Jan 2009 13:51:13 -0800 Subject: memcg: fix hierarchical reclaim If root_mem has no children, last_scaned_child is set to root_mem itself. But after some children added to root_mem, mem_cgroup_get_next_node can mem_cgroup_put the root_mem although root_mem has not been mem_cgroup_get. This patch fixes this behavior by: - Set last_scanned_child to NULL if root_mem has no children or DFS search has returned to root_mem itself(root_mem is not a "child" of root_mem). Make mem_cgroup_get_first_node return root_mem in this case. There are no mem_cgroup_get/put for root_mem. - Rename mem_cgroup_get_next_node to __mem_cgroup_get_next_node, and mem_cgroup_get_first_node to mem_cgroup_get_next_node. Make mem_cgroup_hierarchical_reclaim call only new mem_cgroup_get_next_node. Signed-off-by: Daisuke Nishimura Cc: Balbir Singh Cc: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 68 ++++++++++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 32 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 7be9b35..322625f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -633,7 +633,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan, * called with hierarchy_mutex held */ static struct mem_cgroup * -mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) +__mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) { struct cgroup *cgroup, *curr_cgroup, *root_cgroup; @@ -644,19 +644,16 @@ mem_cgroup_get_next_node(struct mem_cgroup *curr, struct mem_cgroup *root_mem) /* * Walk down to children */ - mem_cgroup_put(curr); cgroup = list_entry(curr_cgroup->children.next, struct cgroup, sibling); curr = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(curr); goto done; } visit_parent: if (curr_cgroup == root_cgroup) { - mem_cgroup_put(curr); - curr = root_mem; - mem_cgroup_get(curr); + /* caller handles NULL case */ + curr = NULL; goto done; } @@ -664,11 +661,9 @@ visit_parent: * Goto next sibling */ if (curr_cgroup->sibling.next != &curr_cgroup->parent->children) { - mem_cgroup_put(curr); cgroup = list_entry(curr_cgroup->sibling.next, struct cgroup, sibling); curr = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(curr); goto done; } @@ -679,7 +674,6 @@ visit_parent: goto visit_parent; done: - root_mem->last_scanned_child = curr; return curr; } @@ -689,40 +683,46 @@ done: * that to reclaim free pages from. */ static struct mem_cgroup * -mem_cgroup_get_first_node(struct mem_cgroup *root_mem) +mem_cgroup_get_next_node(struct mem_cgroup *root_mem) { struct cgroup *cgroup; - struct mem_cgroup *ret; + struct mem_cgroup *orig, *next; bool obsolete; - obsolete = mem_cgroup_is_obsolete(root_mem->last_scanned_child); - /* * Scan all children under the mem_cgroup mem */ mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); + + orig = root_mem->last_scanned_child; + obsolete = mem_cgroup_is_obsolete(orig); + if (list_empty(&root_mem->css.cgroup->children)) { - ret = root_mem; + /* + * root_mem might have children before and last_scanned_child + * may point to one of them. We put it later. + */ + if (orig) + VM_BUG_ON(!obsolete); + next = NULL; goto done; } - if (!root_mem->last_scanned_child || obsolete) { - - if (obsolete && root_mem->last_scanned_child) - mem_cgroup_put(root_mem->last_scanned_child); - + if (!orig || obsolete) { cgroup = list_first_entry(&root_mem->css.cgroup->children, struct cgroup, sibling); - ret = mem_cgroup_from_cont(cgroup); - mem_cgroup_get(ret); + next = mem_cgroup_from_cont(cgroup); } else - ret = mem_cgroup_get_next_node(root_mem->last_scanned_child, - root_mem); + next = __mem_cgroup_get_next_node(orig, root_mem); done: - root_mem->last_scanned_child = ret; + if (next) + mem_cgroup_get(next); + root_mem->last_scanned_child = next; + if (orig) + mem_cgroup_put(orig); mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); - return ret; + return (next) ? next : root_mem; } static bool mem_cgroup_check_under_limit(struct mem_cgroup *mem) @@ -780,21 +780,18 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, if (!root_mem->use_hierarchy) return ret; - next_mem = mem_cgroup_get_first_node(root_mem); + next_mem = mem_cgroup_get_next_node(root_mem); while (next_mem != root_mem) { if (mem_cgroup_is_obsolete(next_mem)) { - mem_cgroup_put(next_mem); - next_mem = mem_cgroup_get_first_node(root_mem); + next_mem = mem_cgroup_get_next_node(root_mem); continue; } ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, get_swappiness(next_mem)); if (mem_cgroup_check_under_limit(root_mem)) return 0; - mutex_lock(&mem_cgroup_subsys.hierarchy_mutex); - next_mem = mem_cgroup_get_next_node(next_mem, root_mem); - mutex_unlock(&mem_cgroup_subsys.hierarchy_mutex); + next_mem = mem_cgroup_get_next_node(root_mem); } return ret; } @@ -2254,7 +2251,14 @@ static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss, static void mem_cgroup_destroy(struct cgroup_subsys *ss, struct cgroup *cont) { - mem_cgroup_put(mem_cgroup_from_cont(cont)); + struct mem_cgroup *mem = mem_cgroup_from_cont(cont); + struct mem_cgroup *last_scanned_child = mem->last_scanned_child; + + if (last_scanned_child) { + VM_BUG_ON(!mem_cgroup_is_obsolete(last_scanned_child)); + mem_cgroup_put(last_scanned_child); + } + mem_cgroup_put(mem); } static int mem_cgroup_populate(struct cgroup_subsys *ss, -- cgit v1.1 From 4d1c627389c8ba6d9e703208567ffcdbd356f682 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 15 Jan 2009 13:51:14 -0800 Subject: memcg: make oom less frequently In previous implementation, mem_cgroup_try_charge checked the return value of mem_cgroup_try_to_free_pages, and just retried if some pages had been reclaimed. But now, try_charge(and mem_cgroup_hierarchical_reclaim called from it) only checks whether the usage is less than the limit. This patch tries to change the behavior as before to cause oom less frequently. Signed-off-by: Daisuke Nishimura Acked-by: Balbir Singh Acked-by: KAMEZAWA Hiroyuki Cc: Pavel Emelyanov Cc: Li Zefan Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 322625f..fb62b43 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -773,10 +773,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, * but there might be left over accounting, even after children * have left. */ - ret = try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, + ret += try_to_free_mem_cgroup_pages(root_mem, gfp_mask, noswap, get_swappiness(root_mem)); if (mem_cgroup_check_under_limit(root_mem)) - return 0; + return 1; /* indicate reclaim has succeeded */ if (!root_mem->use_hierarchy) return ret; @@ -787,10 +787,10 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem, next_mem = mem_cgroup_get_next_node(root_mem); continue; } - ret = try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, + ret += try_to_free_mem_cgroup_pages(next_mem, gfp_mask, noswap, get_swappiness(next_mem)); if (mem_cgroup_check_under_limit(root_mem)) - return 0; + return 1; /* indicate reclaim has succeeded */ next_mem = mem_cgroup_get_next_node(root_mem); } return ret; @@ -875,6 +875,8 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, gfp_mask, noswap); + if (ret) + continue; /* * try_to_free_mem_cgroup_pages() might not give us a full -- cgit v1.1 From 46666d8ac42893f90edde7e57a11bc8749d7e89c Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 15 Jan 2009 13:51:15 -0800 Subject: revert "mm: vmalloc use mutex for purge" Revert commit e97a630eb0f5b8b380fd67504de6cedebb489003 ("mm: vmalloc use mutex for purge") Bryan Donlan reports: : After testing 2.6.29-rc1 on xen-x86 with a btrfs root filesystem, I : got the OOPS quoted below and a hard freeze shortly after boot. : Boot messages and config are attached. : : ------------[ cut here ]------------ : Kernel BUG at c05ef80d [verbose debug info unavailable] : invalid opcode: 0000 [#1] SMP : last sysfs file: /sys/block/xvdc/size : Modules linked in: : : Pid: 0, comm: swapper Not tainted (2.6.29-rc1 #6) : EIP: 0061:[] EFLAGS: 00010087 CPU: 2 : EIP is at schedule+0x7cd/0x950 : EAX: d5aeca80 EBX: 00000002 ECX: 00000000 EDX: d4cb9a40 : ESI: c12f5600 EDI: d4cb9a40 EBP: d6033fa4 ESP: d6033ef4 : DS: 007b ES: 007b FS: 00d8 GS: 0000 SS: 0069 : Process swapper (pid: 0, ti=d6032000 task=d6020b70 task.ti=d6032000) : Stack: : 000d85bc 00000000 000186a0 00000000 0dd11410 c0105417 c12efe00 0dc367c3 : 00000011 c0105d46 d5a5d310 deadbeef d4cb9a40 c07cc600 c05f1340 c12e0060 : deadbeef d6020b70 d6020d08 00000002 c014377d 00000000 c12f5600 00002c22 : Call Trace: : [] xen_force_evtchn_callback+0x17/0x30 : [] check_events+0x8/0x12 : [] _spin_unlock_irqrestore+0x20/0x40 : [] hrtimer_start_range_ns+0x12d/0x2e0 : [] tick_nohz_restart_sched_tick+0x146/0x160 : [] cpu_idle+0xa5/0xc0 and bisected it to this commit. Let's remove it now while we have a think about the problem. Reported-by: Bryan Donlan Tested-by: Christophe Saout Cc: Nick Piggin Cc: Ingo Molnar Cc: Jeremy Fitzhardinge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7e00b28..75f49d3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -496,7 +495,7 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0); static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, int sync, int force_flush) { - static DEFINE_MUTEX(purge_lock); + static DEFINE_SPINLOCK(purge_lock); LIST_HEAD(valist); struct vmap_area *va; int nr = 0; @@ -507,10 +506,10 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, * the case that isn't actually used at the moment anyway. */ if (!sync && !force_flush) { - if (!mutex_trylock(&purge_lock)) + if (!spin_trylock(&purge_lock)) return; } else - mutex_lock(&purge_lock); + spin_lock(&purge_lock); rcu_read_lock(); list_for_each_entry_rcu(va, &vmap_area_list, list) { @@ -542,7 +541,7 @@ static void __purge_vmap_area_lazy(unsigned long *start, unsigned long *end, __free_vmap_area(va); spin_unlock(&vmap_area_lock); } - mutex_unlock(&purge_lock); + spin_unlock(&purge_lock); } /* -- cgit v1.1 From 0eb253e223c88b982461e59154fcad1b82597592 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 13:51:25 -0800 Subject: memcg: fix section mismatch At system boot when creating the top cgroup, mem_cgroup_create() calls enable_swap_cgroup() which is marked as __init, so mark mem_cgroup_create() as __ref to avoid false section mismatch warning. Reported-by: Rakib Mullick Signed-off-by: Li Zefan Acked-by; KAMEZAWA Hiroyuki Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index fb62b43..f0dc076 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2202,7 +2202,7 @@ static void __init enable_swap_cgroup(void) } #endif -static struct cgroup_subsys_state * +static struct cgroup_subsys_state * __ref mem_cgroup_create(struct cgroup_subsys *ss, struct cgroup *cont) { struct mem_cgroup *mem, *parent; -- cgit v1.1 From 068b38c1fa7a9210608f27ac521897ccc5f9b726 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 15 Jan 2009 13:51:26 -0800 Subject: memcg: fix a race when setting memory.swappiness (suppose: memcg->use_hierarchy == 0 and memcg->swappiness == 60) echo 10 > /memcg/0/swappiness | mem_cgroup_swappiness_write() | ... | echo 1 > /memcg/0/use_hierarchy | mkdir /mnt/0/1 | sub_memcg->swappiness = 60; memcg->swappiness = 10; | In the above scenario, we end up having 2 different swappiness values in a single hierarchy. We should hold cgroup_lock() when cheking cgrp->children list. Signed-off-by: Li Zefan Acked-by: KAMEZAWA Hiroyuki Cc: Balbir Singh Cc: Paul Menage Cc: Daisuke Nishimura Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f0dc076..4d0ea3c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1992,6 +1992,7 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, { struct mem_cgroup *memcg = mem_cgroup_from_cont(cgrp); struct mem_cgroup *parent; + if (val > 100) return -EINVAL; @@ -1999,15 +2000,22 @@ static int mem_cgroup_swappiness_write(struct cgroup *cgrp, struct cftype *cft, return -EINVAL; parent = mem_cgroup_from_cont(cgrp->parent); + + cgroup_lock(); + /* If under hierarchy, only empty-root can set this value */ if ((parent->use_hierarchy) || - (memcg->use_hierarchy && !list_empty(&cgrp->children))) + (memcg->use_hierarchy && !list_empty(&cgrp->children))) { + cgroup_unlock(); return -EINVAL; + } spin_lock(&memcg->reclaim_param_lock); memcg->swappiness = val; spin_unlock(&memcg->reclaim_param_lock); + cgroup_unlock(); + return 0; } -- cgit v1.1