summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
authorMel Gorman <mel@csn.ul.ie>2009-02-10 14:02:27 +0000
committerLinus Torvalds <torvalds@linux-foundation.org>2009-02-10 10:48:42 -0800
commit5a6fe125950676015f5108fb71b2a67441755003 (patch)
treec985fac46de39392466c4917c497b50bdc9c0757 /mm
parent4c098bcd55fad34dcf224bf8343db6a9ac58fc68 (diff)
downloadop-kernel-dev-5a6fe125950676015f5108fb71b2a67441755003.zip
op-kernel-dev-5a6fe125950676015f5108fb71b2a67441755003.tar.gz
Do not account for the address space used by hugetlbfs using VM_ACCOUNT
When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. hugetlbfs avoids overcommitting on both shared and private mappings by using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages will exist in the future when faults occur; otherwise it is too easy for applications to be SIGKILLed. As hugetlbfs makes its own reservations in a different unit from the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request that no reserves be made for hugetlbfs, at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs: it can trigger an OOM storm when hugepage pools are too small, and can cause lockups and corrupted counters otherwise. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman <mel@csn.ul.ie> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm')
-rw-r--r--mm/fremap.c2
-rw-r--r--mm/hugetlb.c39
-rw-r--r--mm/mmap.c38
-rw-r--r--mm/mprotect.c5
4 files changed, 51 insertions, 33 deletions
diff --git a/mm/fremap.c b/mm/fremap.c
index 736ba7f..b6ec85a 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags &= MAP_NONBLOCK;
get_file(file);
addr = mmap_region(file, start, size,
- flags, vma->vm_flags, pgoff, 1);
+ flags, vma->vm_flags, pgoff);
fput(file);
if (IS_ERR_VALUE(addr)) {
err = addr;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 618e983..2074642 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
int hugetlb_reserve_pages(struct inode *inode,
long from, long to,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma,
+ int acctflag)
{
- long ret, chg;
+ long ret = 0, chg;
struct hstate *h = hstate_inode(inode);
- if (vma && vma->vm_flags & VM_NORESERVE)
- return 0;
-
/*
* Shared mappings base their reservation on the number of pages that
* are already allocated on behalf of the file. Private mappings need
@@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode,
*/
if (!vma || vma->vm_flags & VM_SHARED)
chg = region_chg(&inode->i_mapping->private_list, from, to);
- else {
- struct resv_map *resv_map = resv_map_alloc();
- if (!resv_map)
- return -ENOMEM;
-
+ else
chg = to - from;
- set_vma_resv_map(vma, resv_map);
- set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
- }
-
if (chg < 0)
return chg;
if (hugetlb_get_quota(inode->i_mapping, chg))
return -ENOSPC;
+
+ /*
+ * Only apply hugepage reservation if asked. We still have to
+ * take the filesystem quota because it is an upper limit
+ * defined for the mount and not necessarily memory as a whole
+ */
+ if (acctflag & VM_NORESERVE) {
+ reset_vma_resv_huge_pages(vma);
+ return 0;
+ }
+
ret = hugetlb_acct_memory(h, chg);
if (ret < 0) {
hugetlb_put_quota(inode->i_mapping, chg);
@@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode,
}
if (!vma || vma->vm_flags & VM_SHARED)
region_add(&inode->i_mapping->private_list, from, to);
+ else {
+ struct resv_map *resv_map = resv_map_alloc();
+
+ if (!resv_map)
+ return -ENOMEM;
+
+ set_vma_resv_map(vma, resv_map);
+ set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+ }
+
return 0;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 214b6a2..eb1270b 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
struct inode *inode;
unsigned int vm_flags;
int error;
- int accountable = 1;
unsigned long reqprot = prot;
/*
@@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
return -EPERM;
vm_flags &= ~VM_MAYEXEC;
}
- if (is_file_hugepages(file))
- accountable = 0;
if (!file->f_op || !file->f_op->mmap)
return -ENODEV;
@@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (error)
return error;
- return mmap_region(file, addr, len, flags, vm_flags, pgoff,
- accountable);
+ return mmap_region(file, addr, len, flags, vm_flags, pgoff);
}
EXPORT_SYMBOL(do_mmap_pgoff);
@@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
/*
* We account for memory if it's a private writeable mapping,
- * and VM_NORESERVE wasn't set.
+ * not hugepages and VM_NORESERVE wasn't set.
*/
-static inline int accountable_mapping(unsigned int vm_flags)
+static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
{
+ /*
+ * hugetlb has its own accounting separate from the core VM
+ * VM_HUGETLB may not be set yet so we cannot check for that flag.
+ */
+ if (file && is_file_hugepages(file))
+ return 0;
+
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
}
unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags,
- unsigned int vm_flags, unsigned long pgoff,
- int accountable)
+ unsigned int vm_flags, unsigned long pgoff)
{
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev;
@@ -1128,18 +1130,22 @@ munmap_back:
/*
* Set 'VM_NORESERVE' if we should not account for the
- * memory use of this mapping. We only honor MAP_NORESERVE
- * if we're allowed to overcommit memory.
+ * memory use of this mapping.
*/
- if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER)
- vm_flags |= VM_NORESERVE;
- if (!accountable)
- vm_flags |= VM_NORESERVE;
+ if ((flags & MAP_NORESERVE)) {
+ /* We honor MAP_NORESERVE if allowed to overcommit */
+ if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
+ vm_flags |= VM_NORESERVE;
+
+ /* hugetlb applies strict overcommit unless MAP_NORESERVE */
+ if (file && is_file_hugepages(file))
+ vm_flags |= VM_NORESERVE;
+ }
/*
* Private writable mapping: check memory availability
*/
- if (accountable_mapping(vm_flags)) {
+ if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory(charged))
return -ENOMEM;
diff --git a/mm/mprotect.c b/mm/mprotect.c
index abe2694..258197b 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
/*
* If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we
- * make it unwritable again.
+ * make it unwritable again. hugetlb mappings were accounted for
+ * even if read-only so there is no need to account for them here
*/
if (newflags & VM_WRITE) {
- if (!(oldflags & (VM_ACCOUNT|VM_WRITE|
+ if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
VM_SHARED|VM_NORESERVE))) {
charged = nrpages;
if (security_vm_enough_memory(charged))
OpenPOWER on IntegriCloud