summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig10
-rw-r--r--mm/Makefile7
-rw-r--r--mm/backing-dev.c12
-rw-r--r--mm/bootmem.c13
-rw-r--r--mm/bounce.c2
-rw-r--r--mm/compaction.c7
-rw-r--r--mm/fremap.c7
-rw-r--r--mm/hugetlb.c106
-rw-r--r--mm/hwpoison-inject.c15
-rw-r--r--mm/ksm.c9
-rw-r--r--mm/memblock.c837
-rw-r--r--mm/memcontrol.c417
-rw-r--r--mm/memory-failure.c132
-rw-r--r--mm/memory.c89
-rw-r--r--mm/memory_hotplug.c18
-rw-r--r--mm/mlock.c13
-rw-r--r--mm/mmap.c25
-rw-r--r--mm/mmzone.c21
-rw-r--r--mm/nommu.c12
-rw-r--r--mm/oom_kill.c67
-rw-r--r--mm/page-writeback.c215
-rw-r--r--mm/page_alloc.c123
-rw-r--r--mm/percpu-km.c8
-rw-r--r--mm/percpu.c407
-rw-r--r--mm/percpu_up.c30
-rw-r--r--mm/rmap.c89
-rw-r--r--mm/shmem.c8
-rw-r--r--mm/slab.c4
-rw-r--r--mm/slob.c4
-rw-r--r--mm/slub.c788
-rw-r--r--mm/sparse-vmemmap.c11
-rw-r--r--mm/swapfile.c129
-rw-r--r--mm/util.c13
-rw-r--r--mm/vmalloc.c15
-rw-r--r--mm/vmscan.c58
-rw-r--r--mm/vmstat.c16
36 files changed, 2315 insertions, 1422 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index f4e516e..c2c8a4a 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -189,7 +189,7 @@ config COMPACTION
config MIGRATION
bool "Page migration"
def_bool y
- depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE
+ depends on NUMA || ARCH_ENABLE_MEMORY_HOTREMOVE || COMPACTION
help
Allows the migration of the physical location of pages of processes
while the virtual addresses are not changed. This is useful in
@@ -301,3 +301,11 @@ config NOMMU_INITIAL_TRIM_EXCESS
of 1 says that all excess pages should be trimmed.
See Documentation/nommu-mmap.txt for more information.
+
+#
+# UP and nommu archs use km based percpu allocator
+#
+config NEED_PER_CPU_KM
+ depends on !SMP
+ bool
+ default y
diff --git a/mm/Makefile b/mm/Makefile
index 34b2546..f73f75a 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -11,7 +11,7 @@ obj-y := bootmem.o filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page_alloc.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
prio_tree.o util.o mmzone.o vmstat.o backing-dev.o \
- page_isolation.o mm_init.o mmu_context.o \
+ page_isolation.o mm_init.o mmu_context.o percpu.o \
$(mmu-y)
obj-y += init-mm.o
@@ -36,11 +36,6 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
obj-$(CONFIG_FS_XIP) += filemap_xip.o
obj-$(CONFIG_MIGRATION) += migrate.o
-ifdef CONFIG_SMP
-obj-y += percpu.o
-else
-obj-y += percpu_up.o
-endif
obj-$(CONFIG_QUICKLIST) += quicklist.o
obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 08d3575..65d4204 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -30,6 +30,7 @@ EXPORT_SYMBOL_GPL(default_backing_dev_info);
struct backing_dev_info noop_backing_dev_info = {
.name = "noop",
+ .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
@@ -81,7 +82,8 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v)
nr_more_io++;
spin_unlock(&inode_lock);
- get_dirty_limits(&background_thresh, &dirty_thresh, &bdi_thresh, bdi);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
#define K(x) ((x) << (PAGE_SHIFT - 10))
seq_printf(m,
@@ -242,6 +244,7 @@ static int __init default_bdi_init(void)
err = bdi_init(&default_backing_dev_info);
if (!err)
bdi_register(&default_backing_dev_info, NULL, "default");
+ err = bdi_init(&noop_backing_dev_info);
return err;
}
@@ -444,8 +447,8 @@ static int bdi_forker_thread(void *ptr)
switch (action) {
case FORK_THREAD:
__set_current_state(TASK_RUNNING);
- task = kthread_run(bdi_writeback_thread, &bdi->wb, "flush-%s",
- dev_name(bdi->dev));
+ task = kthread_create(bdi_writeback_thread, &bdi->wb,
+ "flush-%s", dev_name(bdi->dev));
if (IS_ERR(task)) {
/*
* If thread creation fails, force writeout of
@@ -456,10 +459,13 @@ static int bdi_forker_thread(void *ptr)
/*
* The spinlock makes sure we do not lose
* wake-ups when racing with 'bdi_queue_work()'.
+ * And as soon as the bdi thread is visible, we
+ * can start it.
*/
spin_lock_bh(&bdi->wb_lock);
bdi->wb.task = task;
spin_unlock_bh(&bdi->wb_lock);
+ wake_up_process(task);
}
break;
diff --git a/mm/bootmem.c b/mm/bootmem.c
index 142c84a..13b0caa 100644
--- a/mm/bootmem.c
+++ b/mm/bootmem.c
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/kmemleak.h>
#include <linux/range.h>
+#include <linux/memblock.h>
#include <asm/bug.h>
#include <asm/io.h>
@@ -434,7 +435,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
unsigned long size)
{
#ifdef CONFIG_NO_BOOTMEM
- free_early(physaddr, physaddr + size);
+ kmemleak_free_part(__va(physaddr), size);
+ memblock_x86_free_range(physaddr, physaddr + size);
#else
unsigned long start, end;
@@ -459,7 +461,8 @@ void __init free_bootmem_node(pg_data_t *pgdat, unsigned long physaddr,
void __init free_bootmem(unsigned long addr, unsigned long size)
{
#ifdef CONFIG_NO_BOOTMEM
- free_early(addr, addr + size);
+ kmemleak_free_part(__va(addr), size);
+ memblock_x86_free_range(addr, addr + size);
#else
unsigned long start, end;
@@ -526,6 +529,12 @@ int __init reserve_bootmem(unsigned long addr, unsigned long size,
}
#ifndef CONFIG_NO_BOOTMEM
+int __weak __init reserve_bootmem_generic(unsigned long phys, unsigned long len,
+ int flags)
+{
+ return reserve_bootmem(phys, len, flags);
+}
+
static unsigned long __init align_idx(struct bootmem_data *bdata,
unsigned long idx, unsigned long step)
{
diff --git a/mm/bounce.c b/mm/bounce.c
index 13b6dad..1481de6 100644
--- a/mm/bounce.c
+++ b/mm/bounce.c
@@ -116,8 +116,8 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
*/
vfrom = page_address(fromvec->bv_page) + tovec->bv_offset;
- flush_dcache_page(tovec->bv_page);
bounce_copy_vec(tovec, vfrom);
+ flush_dcache_page(tovec->bv_page);
}
}
diff --git a/mm/compaction.c b/mm/compaction.c
index 94cce51..4d709ee 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -214,15 +214,16 @@ static void acct_isolated(struct zone *zone, struct compact_control *cc)
/* Similar to reclaim, but different enough that they don't share logic */
static bool too_many_isolated(struct zone *zone)
{
-
- unsigned long inactive, isolated;
+ unsigned long active, inactive, isolated;
inactive = zone_page_state(zone, NR_INACTIVE_FILE) +
zone_page_state(zone, NR_INACTIVE_ANON);
+ active = zone_page_state(zone, NR_ACTIVE_FILE) +
+ zone_page_state(zone, NR_ACTIVE_ANON);
isolated = zone_page_state(zone, NR_ISOLATED_FILE) +
zone_page_state(zone, NR_ISOLATED_ANON);
- return isolated > inactive;
+ return isolated > (inactive + active) / 2;
}
/*
diff --git a/mm/fremap.c b/mm/fremap.c
index 46f5dac..ec520c7 100644
--- a/mm/fremap.c
+++ b/mm/fremap.c
@@ -125,7 +125,6 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
{
struct mm_struct *mm = current->mm;
struct address_space *mapping;
- unsigned long end = start + size;
struct vm_area_struct *vma;
int err = -EINVAL;
int has_write_lock = 0;
@@ -142,6 +141,10 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (start + size <= start)
return err;
+ /* Does pgoff wrap? */
+ if (pgoff + (size >> PAGE_SHIFT) < pgoff)
+ return err;
+
/* Can we represent this offset inside this architecture's pte's? */
#if PTE_FILE_MAX_BITS < BITS_PER_LONG
if (pgoff + (size >> PAGE_SHIFT) >= (1UL << PTE_FILE_MAX_BITS))
@@ -168,7 +171,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (!(vma->vm_flags & VM_CAN_NONLINEAR))
goto out;
- if (end <= start || start < vma->vm_start || end > vma->vm_end)
+ if (start < vma->vm_start || start + size > vma->vm_end)
goto out;
/* Must set VM_NONLINEAR before any pages are populated. */
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index b61d2db..c032738 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -18,6 +18,9 @@
#include <linux/bootmem.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/rmap.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
#include <asm/page.h>
#include <asm/pgtable.h>
@@ -220,6 +223,12 @@ static pgoff_t vma_hugecache_offset(struct hstate *h,
(vma->vm_pgoff >> huge_page_order(h));
}
+pgoff_t linear_hugepage_index(struct vm_area_struct *vma,
+ unsigned long address)
+{
+ return vma_hugecache_offset(hstate_vma(vma), vma, address);
+}
+
/*
* Return the size of the pages allocated when backing a VMA. In the majority
* cases this will be same size as used by the page table entries.
@@ -552,6 +561,7 @@ static void free_huge_page(struct page *page)
set_page_private(page, 0);
page->mapping = NULL;
BUG_ON(page_count(page));
+ BUG_ON(page_mapcount(page));
INIT_LIST_HEAD(&page->lru);
spin_lock(&hugetlb_lock);
@@ -605,6 +615,8 @@ int PageHuge(struct page *page)
return dtor == free_huge_page;
}
+EXPORT_SYMBOL_GPL(PageHuge);
+
static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid)
{
struct page *page;
@@ -2129,6 +2141,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
entry = huge_ptep_get(src_pte);
ptepage = pte_page(entry);
get_page(ptepage);
+ page_dup_rmap(ptepage);
set_huge_pte_at(dst, addr, dst_pte, entry);
}
spin_unlock(&src->page_table_lock);
@@ -2140,6 +2153,19 @@ nomem:
return -ENOMEM;
}
+static int is_hugetlb_entry_hwpoisoned(pte_t pte)
+{
+ swp_entry_t swp;
+
+ if (huge_pte_none(pte) || pte_present(pte))
+ return 0;
+ swp = pte_to_swp_entry(pte);
+ if (non_swap_entry(swp) && is_hwpoison_entry(swp)) {
+ return 1;
+ } else
+ return 0;
+}
+
void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
unsigned long end, struct page *ref_page)
{
@@ -2198,6 +2224,12 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
if (huge_pte_none(pte))
continue;
+ /*
+ * HWPoisoned hugepage is already unmapped and dropped reference
+ */
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte)))
+ continue;
+
page = pte_page(pte);
if (pte_dirty(pte))
set_page_dirty(page);
@@ -2207,6 +2239,7 @@ void __unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start,
flush_tlb_range(vma, start, end);
mmu_notifier_invalidate_range_end(mm, start, end);
list_for_each_entry_safe(page, tmp, &page_list, lru) {
+ page_remove_rmap(page);
list_del(&page->lru);
put_page(page);
}
@@ -2272,6 +2305,9 @@ static int unmap_ref_private(struct mm_struct *mm, struct vm_area_struct *vma,
return 1;
}
+/*
+ * Hugetlb_cow() should be called with page lock of the original hugepage held.
+ */
static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, pte_t *ptep, pte_t pte,
struct page *pagecache_page)
@@ -2286,8 +2322,10 @@ static int hugetlb_cow(struct mm_struct *mm, struct vm_area_struct *vma,
retry_avoidcopy:
/* If no-one else is actually using this page, avoid the copy
* and just make the page writable */
- avoidcopy = (page_count(old_page) == 1);
+ avoidcopy = (page_mapcount(old_page) == 1);
if (avoidcopy) {
+ if (PageAnon(old_page))
+ page_move_anon_rmap(old_page, vma, address);
set_huge_ptep_writable(vma, address, ptep);
return 0;
}
@@ -2338,6 +2376,13 @@ retry_avoidcopy:
return -PTR_ERR(new_page);
}
+ /*
+ * When the original hugepage is shared one, it does not have
+ * anon_vma prepared.
+ */
+ if (unlikely(anon_vma_prepare(vma)))
+ return VM_FAULT_OOM;
+
copy_huge_page(new_page, old_page, address, vma);
__SetPageUptodate(new_page);
@@ -2355,6 +2400,8 @@ retry_avoidcopy:
huge_ptep_clear_flush(vma, address, ptep);
set_huge_pte_at(mm, address, ptep,
make_huge_pte(vma, new_page, 1));
+ page_remove_rmap(old_page);
+ hugepage_add_new_anon_rmap(new_page, vma, address);
/* Make the old page be freed below */
new_page = old_page;
mmu_notifier_invalidate_range_end(mm,
@@ -2458,10 +2505,29 @@ retry:
spin_lock(&inode->i_lock);
inode->i_blocks += blocks_per_huge_page(h);
spin_unlock(&inode->i_lock);
+ page_dup_rmap(page);
} else {
lock_page(page);
- page->mapping = HUGETLB_POISON;
+ if (unlikely(anon_vma_prepare(vma))) {
+ ret = VM_FAULT_OOM;
+ goto backout_unlocked;
+ }
+ hugepage_add_new_anon_rmap(page, vma, address);
}
+ } else {
+ page_dup_rmap(page);
+ }
+
+ /*
+ * Since memory error handler replaces pte into hwpoison swap entry
+ * at the time of error handling, a process which reserved but not have
+ * the mapping to the error hugepage does not have hwpoison swap entry.
+ * So we need to block accesses from such a process by checking
+ * PG_hwpoison bit here.
+ */
+ if (unlikely(PageHWPoison(page))) {
+ ret = VM_FAULT_HWPOISON;
+ goto backout_unlocked;
}
/*
@@ -2513,10 +2579,18 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
pte_t *ptep;
pte_t entry;
int ret;
+ struct page *page = NULL;
struct page *pagecache_page = NULL;
static DEFINE_MUTEX(hugetlb_instantiation_mutex);
struct hstate *h = hstate_vma(vma);
+ ptep = huge_pte_offset(mm, address);
+ if (ptep) {
+ entry = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
+ return VM_FAULT_HWPOISON;
+ }
+
ptep = huge_pte_alloc(mm, address, huge_page_size(h));
if (!ptep)
return VM_FAULT_OOM;
@@ -2554,6 +2628,17 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
vma, address);
}
+ /*
+ * hugetlb_cow() requires page locks of pte_page(entry) and
+ * pagecache_page, so here we need take the former one
+ * when page != pagecache_page or !pagecache_page.
+ * Note that locking order is always pagecache_page -> page,
+ * so no worry about deadlock.
+ */
+ page = pte_page(entry);
+ if (page != pagecache_page)
+ lock_page(page);
+
spin_lock(&mm->page_table_lock);
/* Check for a racing update before calling hugetlb_cow */
if (unlikely(!pte_same(entry, huge_ptep_get(ptep))))
@@ -2580,6 +2665,7 @@ out_page_table_lock:
unlock_page(pagecache_page);
put_page(pagecache_page);
}
+ unlock_page(page);
out_mutex:
mutex_unlock(&hugetlb_instantiation_mutex);
@@ -2791,3 +2877,19 @@ void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
hugetlb_put_quota(inode->i_mapping, (chg - freed));
hugetlb_acct_memory(h, -(chg - freed));
}
+
+/*
+ * This function is called from memory failure code.
+ * Assume the caller holds page lock of the head page.
+ */
+void __isolate_hwpoisoned_huge_page(struct page *hpage)
+{
+ struct hstate *h = page_hstate(hpage);
+ int nid = page_to_nid(hpage);
+
+ spin_lock(&hugetlb_lock);
+ list_del(&hpage->lru);
+ h->free_huge_pages--;
+ h->free_huge_pages_node[nid]--;
+ spin_unlock(&hugetlb_lock);
+}
diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c
index 10ea719..0948f10 100644
--- a/mm/hwpoison-inject.c
+++ b/mm/hwpoison-inject.c
@@ -5,6 +5,7 @@
#include <linux/mm.h>
#include <linux/swap.h>
#include <linux/pagemap.h>
+#include <linux/hugetlb.h>
#include "internal.h"
static struct dentry *hwpoison_dir;
@@ -13,6 +14,7 @@ static int hwpoison_inject(void *data, u64 val)
{
unsigned long pfn = val;
struct page *p;
+ struct page *hpage;
int err;
if (!capable(CAP_SYS_ADMIN))
@@ -24,18 +26,19 @@ static int hwpoison_inject(void *data, u64 val)
return -ENXIO;
p = pfn_to_page(pfn);
+ hpage = compound_head(p);
/*
* This implies unable to support free buddy pages.
*/
- if (!get_page_unless_zero(p))
+ if (!get_page_unless_zero(hpage))
return 0;
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
/*
* This implies unable to support non-LRU pages.
*/
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
return 0;
/*
@@ -44,9 +47,9 @@ static int hwpoison_inject(void *data, u64 val)
* We temporarily take page lock for try_get_mem_cgroup_from_page().
* __memory_failure() will redo the check reliably inside page lock.
*/
- lock_page(p);
- err = hwpoison_filter(p);
- unlock_page(p);
+ lock_page(hpage);
+ err = hwpoison_filter(hpage);
+ unlock_page(hpage);
if (err)
return 0;
diff --git a/mm/ksm.c b/mm/ksm.c
index e2ae004..65ab5c7 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -712,7 +712,7 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
if (!ptep)
goto out;
- if (pte_write(*ptep)) {
+ if (pte_write(*ptep) || pte_dirty(*ptep)) {
pte_t entry;
swapped = PageSwapCache(page);
@@ -735,7 +735,9 @@ static int write_protect_page(struct vm_area_struct *vma, struct page *page,
set_pte_at(mm, addr, ptep, entry);
goto out_unlock;
}
- entry = pte_wrprotect(entry);
+ if (pte_dirty(entry))
+ set_page_dirty(page);
+ entry = pte_mkclean(pte_wrprotect(entry));
set_pte_at_notify(mm, addr, ptep, entry);
}
*orig_pte = *ptep;
@@ -1504,8 +1506,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
{
struct page *new_page;
- unlock_page(page); /* any racers will COW it, not modify it */
-
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
@@ -1521,7 +1521,6 @@ struct page *ksm_does_need_to_copy(struct page *page,
add_page_to_unevictable_list(new_page);
}
- page_cache_release(page);
return new_page;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index 43840b3..400dc62 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -11,237 +11,423 @@
*/
#include <linux/kernel.h>
+#include <linux/slab.h>
#include <linux/init.h>
#include <linux/bitops.h>
+#include <linux/poison.h>
+#include <linux/pfn.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
#include <linux/memblock.h>
-#define MEMBLOCK_ALLOC_ANYWHERE 0
+struct memblock memblock __initdata_memblock;
-struct memblock memblock;
+int memblock_debug __initdata_memblock;
+int memblock_can_resize __initdata_memblock;
+static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
+static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS + 1] __initdata_memblock;
-static int memblock_debug;
+/* inline so we don't get a warning when pr_debug is compiled out */
+static inline const char *memblock_type_name(struct memblock_type *type)
+{
+ if (type == &memblock.memory)
+ return "memory";
+ else if (type == &memblock.reserved)
+ return "reserved";
+ else
+ return "unknown";
+}
-static int __init early_memblock(char *p)
+/*
+ * Address comparison utilities
+ */
+
+static phys_addr_t __init_memblock memblock_align_down(phys_addr_t addr, phys_addr_t size)
{
- if (p && strstr(p, "debug"))
- memblock_debug = 1;
+ return addr & ~(size - 1);
+}
+
+static phys_addr_t __init_memblock memblock_align_up(phys_addr_t addr, phys_addr_t size)
+{
+ return (addr + (size - 1)) & ~(size - 1);
+}
+
+static unsigned long __init_memblock memblock_addrs_overlap(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+}
+
+static long __init_memblock memblock_addrs_adjacent(phys_addr_t base1, phys_addr_t size1,
+ phys_addr_t base2, phys_addr_t size2)
+{
+ if (base2 == base1 + size1)
+ return 1;
+ else if (base1 == base2 + size2)
+ return -1;
+
return 0;
}
-early_param("memblock", early_memblock);
-static void memblock_dump(struct memblock_region *region, char *name)
+static long __init_memblock memblock_regions_adjacent(struct memblock_type *type,
+ unsigned long r1, unsigned long r2)
{
- unsigned long long base, size;
- int i;
+ phys_addr_t base1 = type->regions[r1].base;
+ phys_addr_t size1 = type->regions[r1].size;
+ phys_addr_t base2 = type->regions[r2].base;
+ phys_addr_t size2 = type->regions[r2].size;
- pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+ return memblock_addrs_adjacent(base1, size1, base2, size2);
+}
- for (i = 0; i < region->cnt; i++) {
- base = region->region[i].base;
- size = region->region[i].size;
+long __init_memblock memblock_overlaps_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
+{
+ unsigned long i;
- pr_info(" %s[0x%x]\t0x%016llx - 0x%016llx, 0x%llx bytes\n",
- name, i, base, base + size - 1, size);
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
+ if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
+ break;
}
+
+ return (i < type->cnt) ? i : -1;
}
-void memblock_dump_all(void)
+/*
+ * Find, allocate, deallocate or reserve unreserved regions. All allocations
+ * are top-down.
+ */
+
+static phys_addr_t __init_memblock memblock_find_region(phys_addr_t start, phys_addr_t end,
+ phys_addr_t size, phys_addr_t align)
{
- if (!memblock_debug)
- return;
+ phys_addr_t base, res_base;
+ long j;
- pr_info("MEMBLOCK configuration:\n");
- pr_info(" rmo_size = 0x%llx\n", (unsigned long long)memblock.rmo_size);
- pr_info(" memory.size = 0x%llx\n", (unsigned long long)memblock.memory.size);
+ /* In case, huge size is requested */
+ if (end < size)
+ return MEMBLOCK_ERROR;
- memblock_dump(&memblock.memory, "memory");
- memblock_dump(&memblock.reserved, "reserved");
+ base = memblock_align_down((end - size), align);
+
+ /* Prevent allocations returning 0 as it's also used to
+ * indicate an allocation failure
+ */
+ if (start == 0)
+ start = PAGE_SIZE;
+
+ while (start <= base) {
+ j = memblock_overlaps_region(&memblock.reserved, base, size);
+ if (j < 0)
+ return base;
+ res_base = memblock.reserved.regions[j].base;
+ if (res_base < size)
+ break;
+ base = memblock_align_down(res_base - size, align);
+ }
+
+ return MEMBLOCK_ERROR;
}
-static unsigned long memblock_addrs_overlap(u64 base1, u64 size1, u64 base2,
- u64 size2)
+static phys_addr_t __init_memblock memblock_find_base(phys_addr_t size,
+ phys_addr_t align, phys_addr_t start, phys_addr_t end)
{
- return ((base1 < (base2 + size2)) && (base2 < (base1 + size1)));
+ long i;
+
+ BUG_ON(0 == size);
+
+ size = memblock_align_up(size, align);
+
+ /* Pump up max_addr */
+ if (end == MEMBLOCK_ALLOC_ACCESSIBLE)
+ end = memblock.current_limit;
+
+ /* We do a top-down search, this tends to limit memory
+ * fragmentation by keeping early boot allocs near the
+ * top of memory
+ */
+ for (i = memblock.memory.cnt - 1; i >= 0; i--) {
+ phys_addr_t memblockbase = memblock.memory.regions[i].base;
+ phys_addr_t memblocksize = memblock.memory.regions[i].size;
+ phys_addr_t bottom, top, found;
+
+ if (memblocksize < size)
+ continue;
+ if ((memblockbase + memblocksize) <= start)
+ break;
+ bottom = max(memblockbase, start);
+ top = min(memblockbase + memblocksize, end);
+ if (bottom >= top)
+ continue;
+ found = memblock_find_region(bottom, top, size, align);
+ if (found != MEMBLOCK_ERROR)
+ return found;
+ }
+ return MEMBLOCK_ERROR;
}
-static long memblock_addrs_adjacent(u64 base1, u64 size1, u64 base2, u64 size2)
+/*
+ * Find a free area with specified alignment in a specific range.
+ */
+u64 __init_memblock memblock_find_in_range(u64 start, u64 end, u64 size, u64 align)
{
- if (base2 == base1 + size1)
- return 1;
- else if (base1 == base2 + size2)
- return -1;
+ return memblock_find_base(size, align, start, end);
+}
- return 0;
+/*
+ * Free memblock.reserved.regions
+ */
+int __init_memblock memblock_free_reserved_regions(void)
+{
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
+
+ return memblock_free(__pa(memblock.reserved.regions),
+ sizeof(struct memblock_region) * memblock.reserved.max);
}
-static long memblock_regions_adjacent(struct memblock_region *rgn,
- unsigned long r1, unsigned long r2)
+/*
+ * Reserve memblock.reserved.regions
+ */
+int __init_memblock memblock_reserve_reserved_regions(void)
{
- u64 base1 = rgn->region[r1].base;
- u64 size1 = rgn->region[r1].size;
- u64 base2 = rgn->region[r2].base;
- u64 size2 = rgn->region[r2].size;
+ if (memblock.reserved.regions == memblock_reserved_init_regions)
+ return 0;
- return memblock_addrs_adjacent(base1, size1, base2, size2);
+ return memblock_reserve(__pa(memblock.reserved.regions),
+ sizeof(struct memblock_region) * memblock.reserved.max);
}
-static void memblock_remove_region(struct memblock_region *rgn, unsigned long r)
+static void __init_memblock memblock_remove_region(struct memblock_type *type, unsigned long r)
{
unsigned long i;
- for (i = r; i < rgn->cnt - 1; i++) {
- rgn->region[i].base = rgn->region[i + 1].base;
- rgn->region[i].size = rgn->region[i + 1].size;
+ for (i = r; i < type->cnt - 1; i++) {
+ type->regions[i].base = type->regions[i + 1].base;
+ type->regions[i].size = type->regions[i + 1].size;
}
- rgn->cnt--;
+ type->cnt--;
}
/* Assumption: base addr of region 1 < base addr of region 2 */
-static void memblock_coalesce_regions(struct memblock_region *rgn,
+static void __init_memblock memblock_coalesce_regions(struct memblock_type *type,
unsigned long r1, unsigned long r2)
{
- rgn->region[r1].size += rgn->region[r2].size;
- memblock_remove_region(rgn, r2);
+ type->regions[r1].size += type->regions[r2].size;
+ memblock_remove_region(type, r2);
}
-void __init memblock_init(void)
+/* Defined below but needed now */
+static long memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size);
+
+static int __init_memblock memblock_double_array(struct memblock_type *type)
{
- /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
- * This simplifies the memblock_add() code below...
+ struct memblock_region *new_array, *old_array;
+ phys_addr_t old_size, new_size, addr;
+ int use_slab = slab_is_available();
+
+ /* We don't allow resizing until we know about the reserved regions
+ * of memory that aren't suitable for allocation
*/
- memblock.memory.region[0].base = 0;
- memblock.memory.region[0].size = 0;
- memblock.memory.cnt = 1;
+ if (!memblock_can_resize)
+ return -1;
- /* Ditto. */
- memblock.reserved.region[0].base = 0;
- memblock.reserved.region[0].size = 0;
- memblock.reserved.cnt = 1;
-}
+ /* Calculate new doubled size */
+ old_size = type->max * sizeof(struct memblock_region);
+ new_size = old_size << 1;
+
+ /* Try to find some space for it.
+ *
+ * WARNING: We assume that either slab_is_available() and we use it or
+ * we use MEMBLOCK for allocations. That means that this is unsafe to use
+ * when bootmem is currently active (unless bootmem itself is implemented
+ * on top of MEMBLOCK which isn't the case yet)
+ *
+ * This should however not be an issue for now, as we currently only
+ * call into MEMBLOCK while it's still active, or much later when slab is
+ * active for memory hotplug operations
+ */
+ if (use_slab) {
+ new_array = kmalloc(new_size, GFP_KERNEL);
+ addr = new_array == NULL ? MEMBLOCK_ERROR : __pa(new_array);
+ } else
+ addr = memblock_find_base(new_size, sizeof(phys_addr_t), 0, MEMBLOCK_ALLOC_ACCESSIBLE);
+ if (addr == MEMBLOCK_ERROR) {
+ pr_err("memblock: Failed to double %s array from %ld to %ld entries !\n",
+ memblock_type_name(type), type->max, type->max * 2);
+ return -1;
+ }
+ new_array = __va(addr);
-void __init memblock_analyze(void)
-{
- int i;
+ memblock_dbg("memblock: %s array is doubled to %ld at [%#010llx-%#010llx]",
+ memblock_type_name(type), type->max * 2, (u64)addr, (u64)addr + new_size - 1);
- memblock.memory.size = 0;
+ /* Found space, we now need to move the array over before
+ * we add the reserved region since it may be our reserved
+ * array itself that is full.
+ */
+ memcpy(new_array, type->regions, old_size);
+ memset(new_array + type->max, 0, old_size);
+ old_array = type->regions;
+ type->regions = new_array;
+ type->max <<= 1;
+
+ /* If we use SLAB that's it, we are done */
+ if (use_slab)
+ return 0;
- for (i = 0; i < memblock.memory.cnt; i++)
- memblock.memory.size += memblock.memory.region[i].size;
+ /* Add the new reserved region now. Should not fail ! */
+ BUG_ON(memblock_add_region(&memblock.reserved, addr, new_size) < 0);
+
+ /* If the array wasn't our static init one, then free it. We only do
+ * that before SLAB is available as later on, we don't know whether
+ * to use kfree or free_bootmem_pages(). Shouldn't be a big deal
+ * anyways
+ */
+ if (old_array != memblock_memory_init_regions &&
+ old_array != memblock_reserved_init_regions)
+ memblock_free(__pa(old_array), old_size);
+
+ return 0;
}
-static long memblock_add_region(struct memblock_region *rgn, u64 base, u64 size)
+extern int __init_memblock __weak memblock_memory_can_coalesce(phys_addr_t addr1, phys_addr_t size1,
+ phys_addr_t addr2, phys_addr_t size2)
+{
+ return 1;
+}
+
+static long __init_memblock memblock_add_region(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
{
unsigned long coalesced = 0;
long adjacent, i;
- if ((rgn->cnt == 1) && (rgn->region[0].size == 0)) {
- rgn->region[0].base = base;
- rgn->region[0].size = size;
+ if ((type->cnt == 1) && (type->regions[0].size == 0)) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
return 0;
}
/* First try and coalesce this MEMBLOCK with another. */
- for (i = 0; i < rgn->cnt; i++) {
- u64 rgnbase = rgn->region[i].base;
- u64 rgnsize = rgn->region[i].size;
+ for (i = 0; i < type->cnt; i++) {
+ phys_addr_t rgnbase = type->regions[i].base;
+ phys_addr_t rgnsize = type->regions[i].size;
if ((rgnbase == base) && (rgnsize == size))
/* Already have this region, so we're done */
return 0;
adjacent = memblock_addrs_adjacent(base, size, rgnbase, rgnsize);
+ /* Check if arch allows coalescing */
+ if (adjacent != 0 && type == &memblock.memory &&
+ !memblock_memory_can_coalesce(base, size, rgnbase, rgnsize))
+ break;
if (adjacent > 0) {
- rgn->region[i].base -= size;
- rgn->region[i].size += size;
+ type->regions[i].base -= size;
+ type->regions[i].size += size;
coalesced++;
break;
} else if (adjacent < 0) {
- rgn->region[i].size += size;
+ type->regions[i].size += size;
coalesced++;
break;
}
}
- if ((i < rgn->cnt - 1) && memblock_regions_adjacent(rgn, i, i+1)) {
- memblock_coalesce_regions(rgn, i, i+1);
+ /* If we plugged a hole, we may want to also coalesce with the
+ * next region
+ */
+ if ((i < type->cnt - 1) && memblock_regions_adjacent(type, i, i+1) &&
+ ((type != &memblock.memory || memblock_memory_can_coalesce(type->regions[i].base,
+ type->regions[i].size,
+ type->regions[i+1].base,
+ type->regions[i+1].size)))) {
+ memblock_coalesce_regions(type, i, i+1);
coalesced++;
}
if (coalesced)
return coalesced;
- if (rgn->cnt >= MAX_MEMBLOCK_REGIONS)
+
+ /* If we are out of space, we fail. It's too late to resize the array
+ * but then this shouldn't have happened in the first place.
+ */
+ if (WARN_ON(type->cnt >= type->max))
return -1;
/* Couldn't coalesce the MEMBLOCK, so add it to the sorted table. */
- for (i = rgn->cnt - 1; i >= 0; i--) {
- if (base < rgn->region[i].base) {
- rgn->region[i+1].base = rgn->region[i].base;
- rgn->region[i+1].size = rgn->region[i].size;
+ for (i = type->cnt - 1; i >= 0; i--) {
+ if (base < type->regions[i].base) {
+ type->regions[i+1].base = type->regions[i].base;
+ type->regions[i+1].size = type->regions[i].size;
} else {
- rgn->region[i+1].base = base;
- rgn->region[i+1].size = size;
+ type->regions[i+1].base = base;
+ type->regions[i+1].size = size;
break;
}
}
- if (base < rgn->region[0].base) {
- rgn->region[0].base = base;
- rgn->region[0].size = size;
+ if (base < type->regions[0].base) {
+ type->regions[0].base = base;
+ type->regions[0].size = size;
+ }
+ type->cnt++;
+
+ /* The array is full ? Try to resize it. If that fails, we undo
+ * our allocation and return an error
+ */
+ if (type->cnt == type->max && memblock_double_array(type)) {
+ type->cnt--;
+ return -1;
}
- rgn->cnt++;
return 0;
}
-long memblock_add(u64 base, u64 size)
+long __init_memblock memblock_add(phys_addr_t base, phys_addr_t size)
{
- struct memblock_region *_rgn = &memblock.memory;
-
- /* On pSeries LPAR systems, the first MEMBLOCK is our RMO region. */
- if (base == 0)
- memblock.rmo_size = size;
-
- return memblock_add_region(_rgn, base, size);
+ return memblock_add_region(&memblock.memory, base, size);
}
-static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
+static long __init_memblock __memblock_remove(struct memblock_type *type, phys_addr_t base, phys_addr_t size)
{
- u64 rgnbegin, rgnend;
- u64 end = base + size;
+ phys_addr_t rgnbegin, rgnend;
+ phys_addr_t end = base + size;
int i;
rgnbegin = rgnend = 0; /* supress gcc warnings */
/* Find the region where (base, size) belongs to */
- for (i=0; i < rgn->cnt; i++) {
- rgnbegin = rgn->region[i].base;
- rgnend = rgnbegin + rgn->region[i].size;
+ for (i=0; i < type->cnt; i++) {
+ rgnbegin = type->regions[i].base;
+ rgnend = rgnbegin + type->regions[i].size;
if ((rgnbegin <= base) && (end <= rgnend))
break;
}
/* Didn't find the region */
- if (i == rgn->cnt)
+ if (i == type->cnt)
return -1;
/* Check to see if we are removing entire region */
if ((rgnbegin == base) && (rgnend == end)) {
- memblock_remove_region(rgn, i);
+ memblock_remove_region(type, i);
return 0;
}
/* Check to see if region is matching at the front */
if (rgnbegin == base) {
- rgn->region[i].base = end;
- rgn->region[i].size -= size;
+ type->regions[i].base = end;
+ type->regions[i].size -= size;
return 0;
}
/* Check to see if the region is matching at the end */
if (rgnend == end) {
- rgn->region[i].size -= size;
+ type->regions[i].size -= size;
return 0;
}
@@ -249,208 +435,189 @@ static long __memblock_remove(struct memblock_region *rgn, u64 base, u64 size)
* We need to split the entry - adjust the current one to the
* beginging of the hole and add the region after hole.
*/
- rgn->region[i].size = base - rgn->region[i].base;
- return memblock_add_region(rgn, end, rgnend - end);
+ type->regions[i].size = base - type->regions[i].base;
+ return memblock_add_region(type, end, rgnend - end);
}
-long memblock_remove(u64 base, u64 size)
+long __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size)
{
return __memblock_remove(&memblock.memory, base, size);
}
-long __init memblock_free(u64 base, u64 size)
+long __init_memblock memblock_free(phys_addr_t base, phys_addr_t size)
{
return __memblock_remove(&memblock.reserved, base, size);
}
-long __init memblock_reserve(u64 base, u64 size)
+long __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
{
- struct memblock_region *_rgn = &memblock.reserved;
+ struct memblock_type *_rgn = &memblock.reserved;
BUG_ON(0 == size);
return memblock_add_region(_rgn, base, size);
}
-long memblock_overlaps_region(struct memblock_region *rgn, u64 base, u64 size)
+phys_addr_t __init __memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- unsigned long i;
+ phys_addr_t found;
- for (i = 0; i < rgn->cnt; i++) {
- u64 rgnbase = rgn->region[i].base;
- u64 rgnsize = rgn->region[i].size;
- if (memblock_addrs_overlap(base, size, rgnbase, rgnsize))
- break;
- }
+ /* We align the size to limit fragmentation. Without this, a lot of
+ * small allocs quickly eat up the whole reserve array on sparc
+ */
+ size = memblock_align_up(size, align);
- return (i < rgn->cnt) ? i : -1;
+ found = memblock_find_base(size, align, 0, max_addr);
+ if (found != MEMBLOCK_ERROR &&
+ memblock_add_region(&memblock.reserved, found, size) >= 0)
+ return found;
+
+ return 0;
}
-static u64 memblock_align_down(u64 addr, u64 size)
+phys_addr_t __init memblock_alloc_base(phys_addr_t size, phys_addr_t align, phys_addr_t max_addr)
{
- return addr & ~(size - 1);
+ phys_addr_t alloc;
+
+ alloc = __memblock_alloc_base(size, align, max_addr);
+
+ if (alloc == 0)
+ panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
+ (unsigned long long) size, (unsigned long long) max_addr);
+
+ return alloc;
}
-static u64 memblock_align_up(u64 addr, u64 size)
+phys_addr_t __init memblock_alloc(phys_addr_t size, phys_addr_t align)
{
- return (addr + (size - 1)) & ~(size - 1);
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE);
}
-static u64 __init memblock_alloc_nid_unreserved(u64 start, u64 end,
- u64 size, u64 align)
+
+/*
+ * Additional node-local allocators. Search for node memory is bottom up
+ * and walks memblock regions within that node bottom-up as well, but allocation
+ * within an memblock region is top-down. XXX I plan to fix that at some stage
+ *
+ * WARNING: Only available after early_node_map[] has been populated,
+ * on some architectures, that is after all the calls to add_active_range()
+ * have been done to populate it.
+ */
+
+phys_addr_t __weak __init memblock_nid_range(phys_addr_t start, phys_addr_t end, int *nid)
{
- u64 base, res_base;
- long j;
+#ifdef CONFIG_ARCH_POPULATES_NODE_MAP
+ /*
+ * This code originates from sparc which really wants use to walk by addresses
+ * and returns the nid. This is not very convenient for early_pfn_map[] users
+ * as the map isn't sorted yet, and it really wants to be walked by nid.
+ *
+ * For now, I implement the inefficient method below which walks the early
+ * map multiple times. Eventually we may want to use an ARCH config option
+ * to implement a completely different method for both case.
+ */
+ unsigned long start_pfn, end_pfn;
+ int i;
- base = memblock_align_down((end - size), align);
- while (start <= base) {
- j = memblock_overlaps_region(&memblock.reserved, base, size);
- if (j < 0) {
- /* this area isn't reserved, take it */
- if (memblock_add_region(&memblock.reserved, base, size) < 0)
- base = ~(u64)0;
- return base;
- }
- res_base = memblock.reserved.region[j].base;
- if (res_base < size)
- break;
- base = memblock_align_down(res_base - size, align);
+ for (i = 0; i < MAX_NUMNODES; i++) {
+ get_pfn_range_for_nid(i, &start_pfn, &end_pfn);
+ if (start < PFN_PHYS(start_pfn) || start >= PFN_PHYS(end_pfn))
+ continue;
+ *nid = i;
+ return min(end, PFN_PHYS(end_pfn));
}
+#endif
+ *nid = 0;
- return ~(u64)0;
+ return end;
}
-static u64 __init memblock_alloc_nid_region(struct memblock_property *mp,
- u64 (*nid_range)(u64, u64, int *),
- u64 size, u64 align, int nid)
+static phys_addr_t __init memblock_alloc_nid_region(struct memblock_region *mp,
+ phys_addr_t size,
+ phys_addr_t align, int nid)
{
- u64 start, end;
+ phys_addr_t start, end;
start = mp->base;
end = start + mp->size;
start = memblock_align_up(start, align);
while (start < end) {
- u64 this_end;
+ phys_addr_t this_end;
int this_nid;
- this_end = nid_range(start, end, &this_nid);
+ this_end = memblock_nid_range(start, end, &this_nid);
if (this_nid == nid) {
- u64 ret = memblock_alloc_nid_unreserved(start, this_end,
- size, align);
- if (ret != ~(u64)0)
+ phys_addr_t ret = memblock_find_region(start, this_end, size, align);
+ if (ret != MEMBLOCK_ERROR &&
+ memblock_add_region(&memblock.reserved, ret, size) >= 0)
return ret;
}
start = this_end;
}
- return ~(u64)0;
+ return MEMBLOCK_ERROR;
}
-u64 __init memblock_alloc_nid(u64 size, u64 align, int nid,
- u64 (*nid_range)(u64 start, u64 end, int *nid))
+phys_addr_t __init memblock_alloc_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- struct memblock_region *mem = &memblock.memory;
+ struct memblock_type *mem = &memblock.memory;
int i;
BUG_ON(0 == size);
+ /* We align the size to limit fragmentation. Without this, a lot of
+ * small allocs quickly eat up the whole reserve array on sparc
+ */
size = memblock_align_up(size, align);
+ /* We do a bottom-up search for a region with the right
+ * nid since that's easier considering how memblock_nid_range()
+ * works
+ */
for (i = 0; i < mem->cnt; i++) {
- u64 ret = memblock_alloc_nid_region(&mem->region[i],
- nid_range,
+ phys_addr_t ret = memblock_alloc_nid_region(&mem->regions[i],
size, align, nid);
- if (ret != ~(u64)0)
+ if (ret != MEMBLOCK_ERROR)
return ret;
}
- return memblock_alloc(size, align);
-}
-
-u64 __init memblock_alloc(u64 size, u64 align)
-{
- return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
+ return 0;
}
-u64 __init memblock_alloc_base(u64 size, u64 align, u64 max_addr)
+phys_addr_t __init memblock_alloc_try_nid(phys_addr_t size, phys_addr_t align, int nid)
{
- u64 alloc;
-
- alloc = __memblock_alloc_base(size, align, max_addr);
+ phys_addr_t res = memblock_alloc_nid(size, align, nid);
- if (alloc == 0)
- panic("ERROR: Failed to allocate 0x%llx bytes below 0x%llx.\n",
- (unsigned long long) size, (unsigned long long) max_addr);
-
- return alloc;
+ if (res)
+ return res;
+ return memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ANYWHERE);
}
-u64 __init __memblock_alloc_base(u64 size, u64 align, u64 max_addr)
-{
- long i, j;
- u64 base = 0;
- u64 res_base;
-
- BUG_ON(0 == size);
- size = memblock_align_up(size, align);
-
- /* On some platforms, make sure we allocate lowmem */
- /* Note that MEMBLOCK_REAL_LIMIT may be MEMBLOCK_ALLOC_ANYWHERE */
- if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
- max_addr = MEMBLOCK_REAL_LIMIT;
-
- for (i = memblock.memory.cnt - 1; i >= 0; i--) {
- u64 memblockbase = memblock.memory.region[i].base;
- u64 memblocksize = memblock.memory.region[i].size;
-
- if (memblocksize < size)
- continue;
- if (max_addr == MEMBLOCK_ALLOC_ANYWHERE)
- base = memblock_align_down(memblockbase + memblocksize - size, align);
- else if (memblockbase < max_addr) {
- base = min(memblockbase + memblocksize, max_addr);
- base = memblock_align_down(base - size, align);
- } else
- continue;
-
- while (base && memblockbase <= base) {
- j = memblock_overlaps_region(&memblock.reserved, base, size);
- if (j < 0) {
- /* this area isn't reserved, take it */
- if (memblock_add_region(&memblock.reserved, base, size) < 0)
- return 0;
- return base;
- }
- res_base = memblock.reserved.region[j].base;
- if (res_base < size)
- break;
- base = memblock_align_down(res_base - size, align);
- }
- }
- return 0;
-}
+/*
+ * Remaining API functions
+ */
/* You must call memblock_analyze() before this. */
-u64 __init memblock_phys_mem_size(void)
+phys_addr_t __init memblock_phys_mem_size(void)
{
- return memblock.memory.size;
+ return memblock.memory_size;
}
-u64 memblock_end_of_DRAM(void)
+phys_addr_t __init_memblock memblock_end_of_DRAM(void)
{
int idx = memblock.memory.cnt - 1;
- return (memblock.memory.region[idx].base + memblock.memory.region[idx].size);
+ return (memblock.memory.regions[idx].base + memblock.memory.regions[idx].size);
}
/* You must call memblock_analyze() after this. */
-void __init memblock_enforce_memory_limit(u64 memory_limit)
+void __init memblock_enforce_memory_limit(phys_addr_t memory_limit)
{
unsigned long i;
- u64 limit;
- struct memblock_property *p;
+ phys_addr_t limit;
+ struct memblock_region *p;
if (!memory_limit)
return;
@@ -458,24 +625,21 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
/* Truncate the memblock regions to satisfy the memory limit. */
limit = memory_limit;
for (i = 0; i < memblock.memory.cnt; i++) {
- if (limit > memblock.memory.region[i].size) {
- limit -= memblock.memory.region[i].size;
+ if (limit > memblock.memory.regions[i].size) {
+ limit -= memblock.memory.regions[i].size;
continue;
}
- memblock.memory.region[i].size = limit;
+ memblock.memory.regions[i].size = limit;
memblock.memory.cnt = i + 1;
break;
}
- if (memblock.memory.region[0].size < memblock.rmo_size)
- memblock.rmo_size = memblock.memory.region[0].size;
-
memory_limit = memblock_end_of_DRAM();
/* And truncate any reserves above the limit also. */
for (i = 0; i < memblock.reserved.cnt; i++) {
- p = &memblock.reserved.region[i];
+ p = &memblock.reserved.regions[i];
if (p->base > memory_limit)
p->size = 0;
@@ -489,53 +653,190 @@ void __init memblock_enforce_memory_limit(u64 memory_limit)
}
}
-int __init memblock_is_reserved(u64 addr)
+static int __init_memblock memblock_search(struct memblock_type *type, phys_addr_t addr)
+{
+ unsigned int left = 0, right = type->cnt;
+
+ do {
+ unsigned int mid = (right + left) / 2;
+
+ if (addr < type->regions[mid].base)
+ right = mid;
+ else if (addr >= (type->regions[mid].base +
+ type->regions[mid].size))
+ left = mid + 1;
+ else
+ return mid;
+ } while (left < right);
+ return -1;
+}
+
+int __init memblock_is_reserved(phys_addr_t addr)
+{
+ return memblock_search(&memblock.reserved, addr) != -1;
+}
+
+int __init_memblock memblock_is_memory(phys_addr_t addr)
+{
+ return memblock_search(&memblock.memory, addr) != -1;
+}
+
+int __init_memblock memblock_is_region_memory(phys_addr_t base, phys_addr_t size)
+{
+ int idx = memblock_search(&memblock.reserved, base);
+
+ if (idx == -1)
+ return 0;
+ return memblock.reserved.regions[idx].base <= base &&
+ (memblock.reserved.regions[idx].base +
+ memblock.reserved.regions[idx].size) >= (base + size);
+}
+
+int __init_memblock memblock_is_region_reserved(phys_addr_t base, phys_addr_t size)
+{
+ return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+}
+
+
+void __init_memblock memblock_set_current_limit(phys_addr_t limit)
{
+ memblock.current_limit = limit;
+}
+
+static void __init_memblock memblock_dump(struct memblock_type *region, char *name)
+{
+ unsigned long long base, size;
int i;
- for (i = 0; i < memblock.reserved.cnt; i++) {
- u64 upper = memblock.reserved.region[i].base +
- memblock.reserved.region[i].size - 1;
- if ((addr >= memblock.reserved.region[i].base) && (addr <= upper))
- return 1;
+ pr_info(" %s.cnt = 0x%lx\n", name, region->cnt);
+
+ for (i = 0; i < region->cnt; i++) {
+ base = region->regions[i].base;
+ size = region->regions[i].size;
+
+ pr_info(" %s[%#x]\t[%#016llx-%#016llx], %#llx bytes\n",
+ name, i, base, base + size - 1, size);
}
- return 0;
}
-int memblock_is_region_reserved(u64 base, u64 size)
+void __init_memblock memblock_dump_all(void)
{
- return memblock_overlaps_region(&memblock.reserved, base, size) >= 0;
+ if (!memblock_debug)
+ return;
+
+ pr_info("MEMBLOCK configuration:\n");
+ pr_info(" memory size = 0x%llx\n", (unsigned long long)memblock.memory_size);
+
+ memblock_dump(&memblock.memory, "memory");
+ memblock_dump(&memblock.reserved, "reserved");
}
-/*
- * Given a <base, len>, find which memory regions belong to this range.
- * Adjust the request and return a contiguous chunk.
- */
-int memblock_find(struct memblock_property *res)
+void __init memblock_analyze(void)
{
int i;
- u64 rstart, rend;
- rstart = res->base;
- rend = rstart + res->size - 1;
+ /* Check marker in the unused last array entry */
+ WARN_ON(memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS].base
+ != (phys_addr_t)RED_INACTIVE);
+ WARN_ON(memblock_reserved_init_regions[INIT_MEMBLOCK_REGIONS].base
+ != (phys_addr_t)RED_INACTIVE);
+
+ memblock.memory_size = 0;
+
+ for (i = 0; i < memblock.memory.cnt; i++)
+ memblock.memory_size += memblock.memory.regions[i].size;
+
+ /* We allow resizing from there */
+ memblock_can_resize = 1;
+}
+
+void __init memblock_init(void)
+{
+ static int init_done __initdata = 0;
+
+ if (init_done)
+ return;
+ init_done = 1;
+
+ /* Hookup the initial arrays */
+ memblock.memory.regions = memblock_memory_init_regions;
+ memblock.memory.max = INIT_MEMBLOCK_REGIONS;
+ memblock.reserved.regions = memblock_reserved_init_regions;
+ memblock.reserved.max = INIT_MEMBLOCK_REGIONS;
+
+ /* Write a marker in the unused last array entry */
+ memblock.memory.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+ memblock.reserved.regions[INIT_MEMBLOCK_REGIONS].base = (phys_addr_t)RED_INACTIVE;
+
+ /* Create a dummy zero size MEMBLOCK which will get coalesced away later.
+ * This simplifies the memblock_add() code below...
+ */
+ memblock.memory.regions[0].base = 0;
+ memblock.memory.regions[0].size = 0;
+ memblock.memory.cnt = 1;
+
+ /* Ditto. */
+ memblock.reserved.regions[0].base = 0;
+ memblock.reserved.regions[0].size = 0;
+ memblock.reserved.cnt = 1;
+
+ memblock.current_limit = MEMBLOCK_ALLOC_ANYWHERE;
+}
+
+static int __init early_memblock(char *p)
+{
+ if (p && strstr(p, "debug"))
+ memblock_debug = 1;
+ return 0;
+}
+early_param("memblock", early_memblock);
+
+#if defined(CONFIG_DEBUG_FS) && !defined(ARCH_DISCARD_MEMBLOCK)
+
+static int memblock_debug_show(struct seq_file *m, void *private)
+{
+ struct memblock_type *type = m->private;
+ struct memblock_region *reg;
+ int i;
+
+ for (i = 0; i < type->cnt; i++) {
+ reg = &type->regions[i];
+ seq_printf(m, "%4d: ", i);
+ if (sizeof(phys_addr_t) == 4)
+ seq_printf(m, "0x%08lx..0x%08lx\n",
+ (unsigned long)reg->base,
+ (unsigned long)(reg->base + reg->size - 1));
+ else
+ seq_printf(m, "0x%016llx..0x%016llx\n",
+ (unsigned long long)reg->base,
+ (unsigned long long)(reg->base + reg->size - 1));
- for (i = 0; i < memblock.memory.cnt; i++) {
- u64 start = memblock.memory.region[i].base;
- u64 end = start + memblock.memory.region[i].size - 1;
-
- if (start > rend)
- return -1;
-
- if ((end >= rstart) && (start < rend)) {
- /* adjust the request */
- if (rstart < start)
- rstart = start;
- if (rend > end)
- rend = end;
- res->base = rstart;
- res->size = rend - rstart + 1;
- return 0;
- }
}
- return -1;
+ return 0;
+}
+
+static int memblock_debug_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, memblock_debug_show, inode->i_private);
}
+
+static const struct file_operations memblock_debug_fops = {
+ .open = memblock_debug_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int __init memblock_init_debugfs(void)
+{
+ struct dentry *root = debugfs_create_dir("memblock", NULL);
+ if (!root)
+ return -ENXIO;
+ debugfs_create_file("memory", S_IRUGO, root, &memblock.memory, &memblock_debug_fops);
+ debugfs_create_file("reserved", S_IRUGO, root, &memblock.reserved, &memblock_debug_fops);
+
+ return 0;
+}
+__initcall(memblock_init_debugfs);
+
+#endif /* CONFIG_DEBUG_FS */
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0576e9e..9be3cf8 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -47,6 +47,7 @@
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
+#include <linux/oom.h>
#include "internal.h"
#include <asm/uaccess.h>
@@ -268,6 +269,7 @@ enum move_type {
/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
+ spinlock_t lock; /* for from, to, moving_task */
struct mem_cgroup *from;
struct mem_cgroup *to;
unsigned long precharge;
@@ -276,6 +278,7 @@ static struct move_charge_struct {
struct task_struct *moving_task; /* a task moving charges */
wait_queue_head_t waitq; /* a waitq for other context */
} mc = {
+ .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};
@@ -836,12 +839,13 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
{
int ret;
struct mem_cgroup *curr = NULL;
+ struct task_struct *p;
- task_lock(task);
- rcu_read_lock();
- curr = try_get_mem_cgroup_from_mm(task->mm);
- rcu_read_unlock();
- task_unlock(task);
+ p = find_lock_task_mm(task);
+ if (!p)
+ return 0;
+ curr = try_get_mem_cgroup_from_mm(p->mm);
+ task_unlock(p);
if (!curr)
return 0;
/*
@@ -915,7 +919,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone *zone,
enum lru_list lru)
{
- int nid = zone->zone_pgdat->node_id;
+ int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
@@ -925,7 +929,7 @@ unsigned long mem_cgroup_zone_nr_pages(struct mem_cgroup *memcg,
struct zone_reclaim_stat *mem_cgroup_get_reclaim_stat(struct mem_cgroup *memcg,
struct zone *zone)
{
- int nid = zone->zone_pgdat->node_id;
+ int nid = zone_to_nid(zone);
int zid = zone_idx(zone);
struct mem_cgroup_per_zone *mz = mem_cgroup_zoneinfo(memcg, nid, zid);
@@ -970,7 +974,7 @@ unsigned long mem_cgroup_isolate_pages(unsigned long nr_to_scan,
LIST_HEAD(pc_list);
struct list_head *src;
struct page_cgroup *pc, *tmp;
- int nid = z->zone_pgdat->node_id;
+ int nid = zone_to_nid(z);
int zid = zone_idx(z);
struct mem_cgroup_per_zone *mz;
int lru = LRU_FILE * file + active;
@@ -1047,6 +1051,47 @@ static unsigned int get_swappiness(struct mem_cgroup *memcg)
return swappiness;
}
+/* A routine for testing mem is not under move_account */
+
+static bool mem_cgroup_under_move(struct mem_cgroup *mem)
+{
+ struct mem_cgroup *from;
+ struct mem_cgroup *to;
+ bool ret = false;
+ /*
+ * Unlike task_move routines, we access mc.to, mc.from not under
+ * mutual exclusion by cgroup_mutex. Here, we take spinlock instead.
+ */
+ spin_lock(&mc.lock);
+ from = mc.from;
+ to = mc.to;
+ if (!from)
+ goto unlock;
+ if (from == mem || to == mem
+ || (mem->use_hierarchy && css_is_ancestor(&from->css, &mem->css))
+ || (mem->use_hierarchy && css_is_ancestor(&to->css, &mem->css)))
+ ret = true;
+unlock:
+ spin_unlock(&mc.lock);
+ return ret;
+}
+
+static bool mem_cgroup_wait_acct_move(struct mem_cgroup *mem)
+{
+ if (mc.moving_task && current != mc.moving_task) {
+ if (mem_cgroup_under_move(mem)) {
+ DEFINE_WAIT(wait);
+ prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
+ /* moving charge context might have finished. */
+ if (mc.moving_task)
+ schedule();
+ finish_wait(&mc.waitq, &wait);
+ return true;
+ }
+ }
+ return false;
+}
+
static int mem_cgroup_count_children_cb(struct mem_cgroup *mem, void *data)
{
int *val = data;
@@ -1255,8 +1300,7 @@ static int mem_cgroup_hierarchical_reclaim(struct mem_cgroup *root_mem,
/* we use swappiness of local cgroup */
if (check_soft)
ret = mem_cgroup_shrink_node_zone(victim, gfp_mask,
- noswap, get_swappiness(victim), zone,
- zone->zone_pgdat->node_id);
+ noswap, get_swappiness(victim), zone);
else
ret = try_to_free_mem_cgroup_pages(victim, gfp_mask,
noswap, get_swappiness(victim));
@@ -1363,7 +1407,7 @@ static void memcg_wakeup_oom(struct mem_cgroup *mem)
static void memcg_oom_recover(struct mem_cgroup *mem)
{
- if (atomic_read(&mem->oom_lock))
+ if (mem && atomic_read(&mem->oom_lock))
memcg_wakeup_oom(mem);
}
@@ -1575,16 +1619,83 @@ static int __cpuinit memcg_stock_cpu_callback(struct notifier_block *nb,
return NOTIFY_OK;
}
+
+/* See __mem_cgroup_try_charge() for details */
+enum {
+ CHARGE_OK, /* success */
+ CHARGE_RETRY, /* need to retry but retry is not bad */
+ CHARGE_NOMEM, /* we can't do more. return -ENOMEM */
+ CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */
+ CHARGE_OOM_DIE, /* the current is killed because of OOM */
+};
+
+static int __mem_cgroup_do_charge(struct mem_cgroup *mem, gfp_t gfp_mask,
+ int csize, bool oom_check)
+{
+ struct mem_cgroup *mem_over_limit;
+ struct res_counter *fail_res;
+ unsigned long flags = 0;
+ int ret;
+
+ ret = res_counter_charge(&mem->res, csize, &fail_res);
+
+ if (likely(!ret)) {
+ if (!do_swap_account)
+ return CHARGE_OK;
+ ret = res_counter_charge(&mem->memsw, csize, &fail_res);
+ if (likely(!ret))
+ return CHARGE_OK;
+
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw);
+ flags |= MEM_CGROUP_RECLAIM_NOSWAP;
+ } else
+ mem_over_limit = mem_cgroup_from_res_counter(fail_res, res);
+
+ if (csize > PAGE_SIZE) /* change csize and retry */
+ return CHARGE_RETRY;
+
+ if (!(gfp_mask & __GFP_WAIT))
+ return CHARGE_WOULDBLOCK;
+
+ ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
+ gfp_mask, flags);
+ /*
+ * try_to_free_mem_cgroup_pages() might not give us a full
+ * picture of reclaim. Some pages are reclaimed and might be
+ * moved to swap cache or just unmapped from the cgroup.
+ * Check the limit again to see if the reclaim reduced the
+ * current usage of the cgroup before giving up
+ */
+ if (ret || mem_cgroup_check_under_limit(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /*
+ * At task move, charge accounts can be doubly counted. So, it's
+ * better to wait until the end of task_move if something is going on.
+ */
+ if (mem_cgroup_wait_acct_move(mem_over_limit))
+ return CHARGE_RETRY;
+
+ /* If we don't need to call oom-killer at el, return immediately */
+ if (!oom_check)
+ return CHARGE_NOMEM;
+ /* check OOM */
+ if (!mem_cgroup_handle_oom(mem_over_limit, gfp_mask))
+ return CHARGE_OOM_DIE;
+
+ return CHARGE_RETRY;
+}
+
/*
* Unlike exported interface, "oom" parameter is added. if oom==true,
* oom-killer can be invoked.
*/
static int __mem_cgroup_try_charge(struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
+ gfp_t gfp_mask, struct mem_cgroup **memcg, bool oom)
{
- struct mem_cgroup *mem, *mem_over_limit;
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- struct res_counter *fail_res;
+ int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ struct mem_cgroup *mem = NULL;
+ int ret;
int csize = CHARGE_SIZE;
/*
@@ -1602,126 +1713,108 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
* thread group leader migrates. It's possible that mm is not
* set, if so charge the init_mm (happens for pagecache usage).
*/
- mem = *memcg;
- if (likely(!mem)) {
- mem = try_get_mem_cgroup_from_mm(mm);
- *memcg = mem;
- } else {
- css_get(&mem->css);
- }
- if (unlikely(!mem))
- return 0;
-
- VM_BUG_ON(css_is_removed(&mem->css));
- if (mem_cgroup_is_root(mem))
- goto done;
-
- while (1) {
- int ret = 0;
- unsigned long flags = 0;
-
+ if (!*memcg && !mm)
+ goto bypass;
+again:
+ if (*memcg) { /* css should be a valid one */
+ mem = *memcg;
+ VM_BUG_ON(css_is_removed(&mem->css));
+ if (mem_cgroup_is_root(mem))
+ goto done;
if (consume_stock(mem))
goto done;
+ css_get(&mem->css);
+ } else {
+ struct task_struct *p;
- ret = res_counter_charge(&mem->res, csize, &fail_res);
- if (likely(!ret)) {
- if (!do_swap_account)
- break;
- ret = res_counter_charge(&mem->memsw, csize, &fail_res);
- if (likely(!ret))
- break;
- /* mem+swap counter fails */
- res_counter_uncharge(&mem->res, csize);
- flags |= MEM_CGROUP_RECLAIM_NOSWAP;
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- memsw);
- } else
- /* mem counter fails */
- mem_over_limit = mem_cgroup_from_res_counter(fail_res,
- res);
-
- /* reduce request size and retry */
- if (csize > PAGE_SIZE) {
- csize = PAGE_SIZE;
- continue;
- }
- if (!(gfp_mask & __GFP_WAIT))
- goto nomem;
-
- ret = mem_cgroup_hierarchical_reclaim(mem_over_limit, NULL,
- gfp_mask, flags);
- if (ret)
- continue;
-
+ rcu_read_lock();
+ p = rcu_dereference(mm->owner);
+ VM_BUG_ON(!p);
/*
- * try_to_free_mem_cgroup_pages() might not give us a full
- * picture of reclaim. Some pages are reclaimed and might be
- * moved to swap cache or just unmapped from the cgroup.
- * Check the limit again to see if the reclaim reduced the
- * current usage of the cgroup before giving up
- *
+ * because we don't have task_lock(), "p" can exit while
+ * we're here. In that case, "mem" can point to root
+ * cgroup but never be NULL. (and task_struct itself is freed
+ * by RCU, cgroup itself is RCU safe.) Then, we have small
+ * risk here to get wrong cgroup. But such kind of mis-account
+ * by race always happens because we don't have cgroup_mutex().
+ * It's overkill and we allow that small race, here.
*/
- if (mem_cgroup_check_under_limit(mem_over_limit))
- continue;
-
- /* try to avoid oom while someone is moving charge */
- if (mc.moving_task && current != mc.moving_task) {
- struct mem_cgroup *from, *to;
- bool do_continue = false;
+ mem = mem_cgroup_from_task(p);
+ VM_BUG_ON(!mem);
+ if (mem_cgroup_is_root(mem)) {
+ rcu_read_unlock();
+ goto done;
+ }
+ if (consume_stock(mem)) {
/*
- * There is a small race that "from" or "to" can be
- * freed by rmdir, so we use css_tryget().
+ * It seems dagerous to access memcg without css_get().
+ * But considering how consume_stok works, it's not
+ * necessary. If consume_stock success, some charges
+ * from this memcg are cached on this cpu. So, we
+ * don't need to call css_get()/css_tryget() before
+ * calling consume_stock().
*/
- from = mc.from;
- to = mc.to;
- if (from && css_tryget(&from->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &from->css,
- &mem_over_limit->css);
- else
- do_continue = (from == mem_over_limit);
- css_put(&from->css);
- }
- if (!do_continue && to && css_tryget(&to->css)) {
- if (mem_over_limit->use_hierarchy)
- do_continue = css_is_ancestor(
- &to->css,
- &mem_over_limit->css);
- else
- do_continue = (to == mem_over_limit);
- css_put(&to->css);
- }
- if (do_continue) {
- DEFINE_WAIT(wait);
- prepare_to_wait(&mc.waitq, &wait,
- TASK_INTERRUPTIBLE);
- /* moving charge context might have finished. */
- if (mc.moving_task)
- schedule();
- finish_wait(&mc.waitq, &wait);
- continue;
- }
+ rcu_read_unlock();
+ goto done;
+ }
+ /* after here, we may be blocked. we need to get refcnt */
+ if (!css_tryget(&mem->css)) {
+ rcu_read_unlock();
+ goto again;
}
+ rcu_read_unlock();
+ }
- if (!nr_retries--) {
- if (!oom)
+ do {
+ bool oom_check;
+
+ /* If killed, bypass charge */
+ if (fatal_signal_pending(current)) {
+ css_put(&mem->css);
+ goto bypass;
+ }
+
+ oom_check = false;
+ if (oom && !nr_oom_retries) {
+ oom_check = true;
+ nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ }
+
+ ret = __mem_cgroup_do_charge(mem, gfp_mask, csize, oom_check);
+
+ switch (ret) {
+ case CHARGE_OK:
+ break;
+ case CHARGE_RETRY: /* not in OOM situation but retry */
+ csize = PAGE_SIZE;
+ css_put(&mem->css);
+ mem = NULL;
+ goto again;
+ case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
+ css_put(&mem->css);
+ goto nomem;
+ case CHARGE_NOMEM: /* OOM routine works */
+ if (!oom) {
+ css_put(&mem->css);
goto nomem;
- if (mem_cgroup_handle_oom(mem_over_limit, gfp_mask)) {
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
- continue;
}
- /* When we reach here, current task is dying .*/
+ /* If oom, we never return -ENOMEM */
+ nr_oom_retries--;
+ break;
+ case CHARGE_OOM_DIE: /* Killed by OOM Killer */
css_put(&mem->css);
goto bypass;
}
- }
+ } while (ret != CHARGE_OK);
+
if (csize > PAGE_SIZE)
refill_stock(mem, csize - PAGE_SIZE);
+ css_put(&mem->css);
done:
+ *memcg = mem;
return 0;
nomem:
- css_put(&mem->css);
+ *memcg = NULL;
return -ENOMEM;
bypass:
*memcg = NULL;
@@ -1740,11 +1833,7 @@ static void __mem_cgroup_cancel_charge(struct mem_cgroup *mem,
res_counter_uncharge(&mem->res, PAGE_SIZE * count);
if (do_swap_account)
res_counter_uncharge(&mem->memsw, PAGE_SIZE * count);
- VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
- WARN_ON_ONCE(count > INT_MAX);
- __css_put(&mem->css, (int)count);
}
- /* we don't need css_put for root */
}
static void mem_cgroup_cancel_charge(struct mem_cgroup *mem)
@@ -1972,10 +2061,9 @@ out:
* < 0 if the cgroup is over its limit
*/
static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, enum charge_type ctype,
- struct mem_cgroup *memcg)
+ gfp_t gfp_mask, enum charge_type ctype)
{
- struct mem_cgroup *mem;
+ struct mem_cgroup *mem = NULL;
struct page_cgroup *pc;
int ret;
@@ -1985,7 +2073,6 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
return 0;
prefetchw(pc);
- mem = memcg;
ret = __mem_cgroup_try_charge(mm, gfp_mask, &mem, true);
if (ret || !mem)
return ret;
@@ -2013,7 +2100,7 @@ int mem_cgroup_newpage_charge(struct page *page,
if (unlikely(!mm))
mm = &init_mm;
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_MAPPED, NULL);
+ MEM_CGROUP_CHARGE_TYPE_MAPPED);
}
static void
@@ -2023,7 +2110,6 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
gfp_t gfp_mask)
{
- struct mem_cgroup *mem = NULL;
int ret;
if (mem_cgroup_disabled())
@@ -2044,7 +2130,6 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (!(gfp_mask & __GFP_WAIT)) {
struct page_cgroup *pc;
-
pc = lookup_page_cgroup(page);
if (!pc)
return 0;
@@ -2056,22 +2141,24 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
unlock_page_cgroup(pc);
}
- if (unlikely(!mm && !mem))
+ if (unlikely(!mm))
mm = &init_mm;
if (page_is_file_cache(page))
return mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_CACHE, NULL);
+ MEM_CGROUP_CHARGE_TYPE_CACHE);
/* shmem */
if (PageSwapCache(page)) {
+ struct mem_cgroup *mem = NULL;
+
ret = mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &mem);
if (!ret)
__mem_cgroup_commit_charge_swapin(page, mem,
MEM_CGROUP_CHARGE_TYPE_SHMEM);
} else
ret = mem_cgroup_charge_common(page, mm, gfp_mask,
- MEM_CGROUP_CHARGE_TYPE_SHMEM, mem);
+ MEM_CGROUP_CHARGE_TYPE_SHMEM);
return ret;
}
@@ -2107,7 +2194,6 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm,
goto charge_cur_mm;
*ptr = mem;
ret = __mem_cgroup_try_charge(NULL, mask, ptr, true);
- /* drop extra refcnt from tryget */
css_put(&mem->css);
return ret;
charge_cur_mm:
@@ -2238,7 +2324,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
{
struct page_cgroup *pc;
struct mem_cgroup *mem = NULL;
- struct mem_cgroup_per_zone *mz;
if (mem_cgroup_disabled())
return NULL;
@@ -2278,10 +2363,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
break;
}
- if (!mem_cgroup_is_root(mem))
- __do_uncharge(mem, ctype);
- if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- mem_cgroup_swap_statistics(mem, true);
mem_cgroup_charge_statistics(mem, pc, false);
ClearPageCgroupUsed(pc);
@@ -2292,13 +2373,18 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
* special functions.
*/
- mz = page_cgroup_zoneinfo(pc);
unlock_page_cgroup(pc);
-
+ /*
+ * even after unlock, we have mem->res.usage here and this memcg
+ * will never be freed.
+ */
memcg_check_events(mem, page);
- /* at swapout, this memcg will be accessed to record to swap */
- if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
- css_put(&mem->css);
+ if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
+ mem_cgroup_swap_statistics(mem, true);
+ mem_cgroup_get(mem);
+ }
+ if (!mem_cgroup_is_root(mem))
+ __do_uncharge(mem, ctype);
return mem;
@@ -2385,13 +2471,12 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
memcg = __mem_cgroup_uncharge_common(page, ctype);
- /* record memcg information */
- if (do_swap_account && swapout && memcg) {
+ /*
+ * record memcg information, if swapout && memcg != NULL,
+ * mem_cgroup_get() was called in uncharge().
+ */
+ if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
- mem_cgroup_get(memcg);
- }
- if (swapout && memcg)
- css_put(&memcg->css);
}
#endif
@@ -2469,7 +2554,6 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
*/
if (!mem_cgroup_is_root(to))
res_counter_uncharge(&to->res, PAGE_SIZE);
- css_put(&to->css);
}
return 0;
}
@@ -2604,11 +2688,8 @@ void mem_cgroup_end_migration(struct mem_cgroup *mem,
ClearPageCgroupMigration(pc);
unlock_page_cgroup(pc);
- if (unused != oldpage)
- pc = lookup_page_cgroup(unused);
__mem_cgroup_uncharge_common(unused, MEM_CGROUP_CHARGE_TYPE_FORCE);
- pc = lookup_page_cgroup(used);
/*
* If a page is a file cache, radix-tree replacement is very atomic
* and we can skip this check. When it was an Anon page, its mapcount
@@ -2784,8 +2865,7 @@ static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg,
}
unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
- gfp_t gfp_mask, int nid,
- int zid)
+ gfp_t gfp_mask)
{
unsigned long nr_reclaimed = 0;
struct mem_cgroup_per_zone *mz, *next_mz = NULL;
@@ -2797,7 +2877,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order,
if (order > 0)
return 0;
- mctz = soft_limit_tree_node_zone(nid, zid);
+ mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone));
/*
* This loop can run a while, specially if mem_cgroup's continuously
* keep exceeding their soft limit and putting the system under
@@ -3507,9 +3587,13 @@ unlock:
static void mem_cgroup_threshold(struct mem_cgroup *memcg)
{
- __mem_cgroup_threshold(memcg, false);
- if (do_swap_account)
- __mem_cgroup_threshold(memcg, true);
+ while (memcg) {
+ __mem_cgroup_threshold(memcg, false);
+ if (do_swap_account)
+ __mem_cgroup_threshold(memcg, true);
+
+ memcg = parent_mem_cgroup(memcg);
+ }
}
static int compare_thresholds(const void *a, const void *b)
@@ -3752,8 +3836,6 @@ static int mem_cgroup_oom_control_read(struct cgroup *cgrp,
return 0;
}
-/*
- */
static int mem_cgroup_oom_control_write(struct cgroup *cgrp,
struct cftype *cft, u64 val)
{
@@ -4173,9 +4255,6 @@ static int mem_cgroup_do_precharge(unsigned long count)
goto one_by_one;
}
mc.precharge += count;
- VM_BUG_ON(test_bit(CSS_ROOT, &mem->css.flags));
- WARN_ON_ONCE(count > INT_MAX);
- __css_get(&mem->css, (int)count);
return ret;
}
one_by_one:
@@ -4393,11 +4472,13 @@ static int mem_cgroup_precharge_mc(struct mm_struct *mm)
static void mem_cgroup_clear_mc(void)
{
+ struct mem_cgroup *from = mc.from;
+ struct mem_cgroup *to = mc.to;
+
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
__mem_cgroup_cancel_charge(mc.to, mc.precharge);
mc.precharge = 0;
- memcg_oom_recover(mc.to);
}
/*
* we didn't uncharge from mc.from at mem_cgroup_move_account(), so
@@ -4406,11 +4487,9 @@ static void mem_cgroup_clear_mc(void)
if (mc.moved_charge) {
__mem_cgroup_cancel_charge(mc.from, mc.moved_charge);
mc.moved_charge = 0;
- memcg_oom_recover(mc.from);
}
/* we must fixup refcnts and charges */
if (mc.moved_swap) {
- WARN_ON_ONCE(mc.moved_swap > INT_MAX);
/* uncharge swap account from the old cgroup */
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
@@ -4424,16 +4503,18 @@ static void mem_cgroup_clear_mc(void)
*/
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
- VM_BUG_ON(test_bit(CSS_ROOT, &mc.to->css.flags));
- __css_put(&mc.to->css, mc.moved_swap);
}
/* we've already done mem_cgroup_get(mc.to) */
mc.moved_swap = 0;
}
+ spin_lock(&mc.lock);
mc.from = NULL;
mc.to = NULL;
mc.moving_task = NULL;
+ spin_unlock(&mc.lock);
+ memcg_oom_recover(from);
+ memcg_oom_recover(to);
wake_up_all(&mc.waitq);
}
@@ -4462,12 +4543,14 @@ static int mem_cgroup_can_attach(struct cgroup_subsys *ss,
VM_BUG_ON(mc.moved_charge);
VM_BUG_ON(mc.moved_swap);
VM_BUG_ON(mc.moving_task);
+ spin_lock(&mc.lock);
mc.from = from;
mc.to = mem;
mc.precharge = 0;
mc.moved_charge = 0;
mc.moved_swap = 0;
mc.moving_task = current;
+ spin_unlock(&mc.lock);
ret = mem_cgroup_precharge_mc(mm);
if (ret)
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 6b44e52..757f6b0 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -46,6 +46,7 @@
#include <linux/suspend.h>
#include <linux/slab.h>
#include <linux/swapops.h>
+#include <linux/hugetlb.h>
#include "internal.h"
int sysctl_memory_failure_early_kill __read_mostly = 0;
@@ -182,7 +183,7 @@ EXPORT_SYMBOL_GPL(hwpoison_filter);
* signal.
*/
static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
- unsigned long pfn)
+ unsigned long pfn, struct page *page)
{
struct siginfo si;
int ret;
@@ -197,7 +198,7 @@ static int kill_proc_ao(struct task_struct *t, unsigned long addr, int trapno,
#ifdef __ARCH_SI_TRAPNO
si.si_trapno = trapno;
#endif
- si.si_addr_lsb = PAGE_SHIFT;
+ si.si_addr_lsb = compound_order(compound_head(page)) + PAGE_SHIFT;
/*
* Don't use force here, it's convenient if the signal
* can be temporarily blocked.
@@ -234,7 +235,7 @@ void shake_page(struct page *p, int access)
int nr;
do {
nr = shrink_slab(1000, GFP_KERNEL, 1000);
- if (page_count(p) == 0)
+ if (page_count(p) == 1)
break;
} while (nr > 10);
}
@@ -326,7 +327,7 @@ static void add_to_kill(struct task_struct *tsk, struct page *p,
* wrong earlier.
*/
static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
- int fail, unsigned long pfn)
+ int fail, struct page *page, unsigned long pfn)
{
struct to_kill *tk, *next;
@@ -351,7 +352,7 @@ static void kill_procs_ao(struct list_head *to_kill, int doit, int trapno,
* process anyways.
*/
else if (kill_proc_ao(tk->tsk, tk->addr, trapno,
- pfn) < 0)
+ pfn, page) < 0)
printk(KERN_ERR
"MCE %#lx: Cannot send advisory machine check signal to %s:%d\n",
pfn, tk->tsk->comm, tk->tsk->pid);
@@ -690,17 +691,29 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
/*
* Huge pages. Needs work.
* Issues:
- * No rmap support so we cannot find the original mapper. In theory could walk
- * all MMs and look for the mappings, but that would be non atomic and racy.
- * Need rmap for hugepages for this. Alternatively we could employ a heuristic,
- * like just walking the current process and hoping it has it mapped (that
- * should be usually true for the common "shared database cache" case)
- * Should handle free huge pages and dequeue them too, but this needs to
- * handle huge page accounting correctly.
+ * - Error on hugepage is contained in hugepage unit (not in raw page unit.)
+ * To narrow down kill region to one page, we need to break up pmd.
+ * - To support soft-offlining for hugepage, we need to support hugepage
+ * migration.
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
- return FAILED;
+ struct page *hpage = compound_head(p);
+ /*
+ * We can safely recover from error on free or reserved (i.e.
+ * not in-use) hugepage by dequeuing it from freelist.
+ * To check whether a hugepage is in-use or not, we can't use
+ * page->lru because it can be used in other hugepage operations,
+ * such as __unmap_hugepage_range() and gather_surplus_pages().
+ * So instead we use page_mapping() and PageAnon().
+ * We assume that this function is called with page lock held,
+ * so there is no race between isolation and mapping/unmapping.
+ */
+ if (!(page_mapping(hpage) || PageAnon(hpage))) {
+ __isolate_hwpoisoned_huge_page(hpage);
+ return RECOVERED;
+ }
+ return DELAYED;
}
/*
@@ -838,6 +851,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
int ret;
int i;
int kill = 1;
+ struct page *hpage = compound_head(p);
if (PageReserved(p) || PageSlab(p))
return SWAP_SUCCESS;
@@ -846,10 +860,10 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* This check implies we don't kill processes if their pages
* are in the swap cache early. Those are always late kills.
*/
- if (!page_mapped(p))
+ if (!page_mapped(hpage))
return SWAP_SUCCESS;
- if (PageCompound(p) || PageKsm(p))
+ if (PageKsm(p))
return SWAP_FAIL;
if (PageSwapCache(p)) {
@@ -864,10 +878,11 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* XXX: the dirty test could be racy: set_page_dirty() may not always
* be called inside page lock (it's recommended but not enforced).
*/
- mapping = page_mapping(p);
- if (!PageDirty(p) && mapping && mapping_cap_writeback_dirty(mapping)) {
- if (page_mkclean(p)) {
- SetPageDirty(p);
+ mapping = page_mapping(hpage);
+ if (!PageDirty(hpage) && mapping &&
+ mapping_cap_writeback_dirty(mapping)) {
+ if (page_mkclean(hpage)) {
+ SetPageDirty(hpage);
} else {
kill = 0;
ttu |= TTU_IGNORE_HWPOISON;
@@ -886,14 +901,14 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* there's nothing that can be done.
*/
if (kill)
- collect_procs(p, &tokill);
+ collect_procs(hpage, &tokill);
/*
* try_to_unmap can fail temporarily due to races.
* Try a few times (RED-PEN better strategy?)
*/
for (i = 0; i < N_UNMAP_TRIES; i++) {
- ret = try_to_unmap(p, ttu);
+ ret = try_to_unmap(hpage, ttu);
if (ret == SWAP_SUCCESS)
break;
pr_debug("MCE %#lx: try_to_unmap retry needed %d\n", pfn, ret);
@@ -901,7 +916,7 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
if (ret != SWAP_SUCCESS)
printk(KERN_ERR "MCE %#lx: failed to unmap page (mapcount=%d)\n",
- pfn, page_mapcount(p));
+ pfn, page_mapcount(hpage));
/*
* Now that the dirty bit has been propagated to the
@@ -912,17 +927,35 @@ static int hwpoison_user_mappings(struct page *p, unsigned long pfn,
* use a more force-full uncatchable kill to prevent
* any accesses to the poisoned memory.
*/
- kill_procs_ao(&tokill, !!PageDirty(p), trapno,
- ret != SWAP_SUCCESS, pfn);
+ kill_procs_ao(&tokill, !!PageDirty(hpage), trapno,
+ ret != SWAP_SUCCESS, p, pfn);
return ret;
}
+static void set_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ SetPageHWPoison(hpage + i);
+}
+
+static void clear_page_hwpoison_huge_page(struct page *hpage)
+{
+ int i;
+ int nr_pages = 1 << compound_order(hpage);
+ for (i = 0; i < nr_pages; i++)
+ ClearPageHWPoison(hpage + i);
+}
+
int __memory_failure(unsigned long pfn, int trapno, int flags)
{
struct page_state *ps;
struct page *p;
+ struct page *hpage;
int res;
+ unsigned int nr_pages;
if (!sysctl_memory_failure_recovery)
panic("Memory failure from trap %d on page %lx", trapno, pfn);
@@ -935,12 +968,14 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
p = pfn_to_page(pfn);
+ hpage = compound_head(p);
if (TestSetPageHWPoison(p)) {
printk(KERN_ERR "MCE %#lx: already hardware poisoned\n", pfn);
return 0;
}
- atomic_long_add(1, &mce_bad_pages);
+ nr_pages = 1 << compound_order(hpage);
+ atomic_long_add(nr_pages, &mce_bad_pages);
/*
* We need/can do nothing about count=0 pages.
@@ -954,7 +989,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* that may make page_freeze_refs()/page_unfreeze_refs() mismatch.
*/
if (!(flags & MF_COUNT_INCREASED) &&
- !get_page_unless_zero(compound_head(p))) {
+ !get_page_unless_zero(hpage)) {
if (is_free_buddy_page(p)) {
action_result(pfn, "free buddy", DELAYED);
return 0;
@@ -972,9 +1007,9 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* The check (unnecessarily) ignores LRU pages being isolated and
* walked by the page reclaim code, however that's not a big loss.
*/
- if (!PageLRU(p))
+ if (!PageLRU(p) && !PageHuge(p))
shake_page(p, 0);
- if (!PageLRU(p)) {
+ if (!PageLRU(p) && !PageHuge(p)) {
/*
* shake_page could have turned it free.
*/
@@ -992,7 +1027,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
* It's very difficult to mess with pages currently under IO
* and in many cases impossible, so we just avoid it here.
*/
- lock_page_nosync(p);
+ lock_page_nosync(hpage);
/*
* unpoison always clear PG_hwpoison inside page lock
@@ -1004,11 +1039,31 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
if (hwpoison_filter(p)) {
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
- unlock_page(p);
- put_page(p);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
+ unlock_page(hpage);
+ put_page(hpage);
+ return 0;
+ }
+
+ /*
+ * For error on the tail page, we should set PG_hwpoison
+ * on the head page to show that the hugepage is hwpoisoned
+ */
+ if (PageTail(p) && TestSetPageHWPoison(hpage)) {
+ action_result(pfn, "hugepage already hardware poisoned",
+ IGNORED);
+ unlock_page(hpage);
+ put_page(hpage);
return 0;
}
+ /*
+ * Set PG_hwpoison on all pages in an error hugepage,
+ * because containment is done in hugepage unit for now.
+ * Since we have done TestSetPageHWPoison() for the head page with
+ * page lock held, we can safely set PG_hwpoison bits on tail pages.
+ */
+ if (PageHuge(p))
+ set_page_hwpoison_huge_page(hpage);
wait_on_page_writeback(p);
@@ -1039,7 +1094,7 @@ int __memory_failure(unsigned long pfn, int trapno, int flags)
}
}
out:
- unlock_page(p);
+ unlock_page(hpage);
return res;
}
EXPORT_SYMBOL_GPL(__memory_failure);
@@ -1083,6 +1138,7 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int freeit = 0;
+ unsigned int nr_pages;
if (!pfn_valid(pfn))
return -ENXIO;
@@ -1095,9 +1151,11 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
+ nr_pages = 1 << compound_order(page);
+
if (!get_page_unless_zero(page)) {
if (TestClearPageHWPoison(p))
- atomic_long_dec(&mce_bad_pages);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
pr_debug("MCE: Software-unpoisoned free page %#lx\n", pfn);
return 0;
}
@@ -1109,11 +1167,13 @@ int unpoison_memory(unsigned long pfn)
* the PG_hwpoison page will be caught and isolated on the entrance to
* the free buddy page pool.
*/
- if (TestClearPageHWPoison(p)) {
+ if (TestClearPageHWPoison(page)) {
pr_debug("MCE: Software-unpoisoned page %#lx\n", pfn);
- atomic_long_dec(&mce_bad_pages);
+ atomic_long_sub(nr_pages, &mce_bad_pages);
freeit = 1;
}
+ if (PageHuge(p))
+ clear_page_hwpoison_huge_page(page);
unlock_page(page);
put_page(page);
diff --git a/mm/memory.c b/mm/memory.c
index 858829d..98b58fe 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2623,7 +2623,7 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned int flags, pte_t orig_pte)
{
spinlock_t *ptl;
- struct page *page;
+ struct page *page, *swapcache = NULL;
swp_entry_t entry;
pte_t pte;
struct mem_cgroup *ptr = NULL;
@@ -2679,10 +2679,25 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
lock_page(page);
delayacct_clear_flag(DELAYACCT_PF_SWAPIN);
- page = ksm_might_need_to_copy(page, vma, address);
- if (!page) {
- ret = VM_FAULT_OOM;
- goto out;
+ /*
+ * Make sure try_to_free_swap or reuse_swap_page or swapoff did not
+ * release the swapcache from under us. The page pin, and pte_same
+ * test below, are not enough to exclude that. Even if it is still
+ * swapcache, we need to check that the page's swap has not changed.
+ */
+ if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val))
+ goto out_page;
+
+ if (ksm_might_need_to_copy(page, vma, address)) {
+ swapcache = page;
+ page = ksm_does_need_to_copy(page, vma, address);
+
+ if (unlikely(!page)) {
+ ret = VM_FAULT_OOM;
+ page = swapcache;
+ swapcache = NULL;
+ goto out_page;
+ }
}
if (mem_cgroup_try_charge_swapin(mm, page, GFP_KERNEL, &ptr)) {
@@ -2735,6 +2750,18 @@ static int do_swap_page(struct mm_struct *mm, struct vm_area_struct *vma,
if (vm_swap_full() || (vma->vm_flags & VM_LOCKED) || PageMlocked(page))
try_to_free_swap(page);
unlock_page(page);
+ if (swapcache) {
+ /*
+ * Hold the lock to avoid the swap entry to be reused
+ * until we take the PT lock for the pte_same() check
+ * (to avoid false positives from pte_same). For
+ * further safety release the lock after the swap_free
+ * so that the swap count won't change under a
+ * parallel locked swapcache.
+ */
+ unlock_page(swapcache);
+ page_cache_release(swapcache);
+ }
if (flags & FAULT_FLAG_WRITE) {
ret |= do_wp_page(mm, vma, address, page_table, pmd, ptl, pte);
@@ -2756,10 +2783,48 @@ out_page:
unlock_page(page);
out_release:
page_cache_release(page);
+ if (swapcache) {
+ unlock_page(swapcache);
+ page_cache_release(swapcache);
+ }
return ret;
}
/*
+ * This is like a special single-page "expand_{down|up}wards()",
+ * except we must first make sure that 'address{-|+}PAGE_SIZE'
+ * doesn't hit another vma.
+ */
+static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned long address)
+{
+ address &= PAGE_MASK;
+ if ((vma->vm_flags & VM_GROWSDOWN) && address == vma->vm_start) {
+ struct vm_area_struct *prev = vma->vm_prev;
+
+ /*
+ * Is there a mapping abutting this one below?
+ *
+ * That's only ok if it's the same stack mapping
+ * that has gotten split..
+ */
+ if (prev && prev->vm_end == address)
+ return prev->vm_flags & VM_GROWSDOWN ? 0 : -ENOMEM;
+
+ expand_stack(vma, address - PAGE_SIZE);
+ }
+ if ((vma->vm_flags & VM_GROWSUP) && address + PAGE_SIZE == vma->vm_end) {
+ struct vm_area_struct *next = vma->vm_next;
+
+ /* As VM_GROWSDOWN but s/below/above/ */
+ if (next && next->vm_start == address + PAGE_SIZE)
+ return next->vm_flags & VM_GROWSUP ? 0 : -ENOMEM;
+
+ expand_upwards(vma, address + PAGE_SIZE);
+ }
+ return 0;
+}
+
+/*
* We enter with non-exclusive mmap_sem (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with mmap_sem still held, but pte unmapped and unlocked.
@@ -2772,19 +2837,23 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
spinlock_t *ptl;
pte_t entry;
+ pte_unmap(page_table);
+
+ /* Check if we need to add a guard page to the stack */
+ if (check_stack_guard_page(vma, address) < 0)
+ return VM_FAULT_SIGBUS;
+
+ /* Use the zero-page for reads */
if (!(flags & FAULT_FLAG_WRITE)) {
entry = pte_mkspecial(pfn_pte(my_zero_pfn(address),
vma->vm_page_prot));
- ptl = pte_lockptr(mm, pmd);
- spin_lock(ptl);
+ page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
if (!pte_none(*page_table))
goto unlock;
goto setpte;
}
/* Allocate our own private page. */
- pte_unmap(page_table);
-
if (unlikely(anon_vma_prepare(vma)))
goto oom;
page = alloc_zeroed_user_highpage_movable(vma, address);
@@ -3116,7 +3185,7 @@ static inline int handle_pte_fault(struct mm_struct *mm,
* with threads.
*/
if (flags & FAULT_FLAG_WRITE)
- flush_tlb_page(vma, address);
+ flush_tlb_fix_spurious_fault(vma, address);
}
unlock:
pte_unmap_unlock(pte, ptl);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index a4cfcdc..d4e940a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -584,19 +584,19 @@ static inline int pageblock_free(struct page *page)
/* Return the start of the next active pageblock after a given page */
static struct page *next_active_pageblock(struct page *page)
{
- int pageblocks_stride;
-
/* Ensure the starting page is pageblock-aligned */
BUG_ON(page_to_pfn(page) & (pageblock_nr_pages - 1));
- /* Move forward by at least 1 * pageblock_nr_pages */
- pageblocks_stride = 1;
-
/* If the entire pageblock is free, move to the end of free page */
- if (pageblock_free(page))
- pageblocks_stride += page_order(page) - pageblock_order;
+ if (pageblock_free(page)) {
+ int order;
+ /* be careful. we don't have locks, page_order can be changed.*/
+ order = page_order(page);
+ if ((order < MAX_ORDER) && (order >= pageblock_order))
+ return page + (1 << order);
+ }
- return page + (pageblocks_stride * pageblock_nr_pages);
+ return page + pageblock_nr_pages;
}
/* Checks if this range of memory is likely to be hot-removable. */
@@ -840,7 +840,6 @@ repeat:
ret = 0;
if (drain) {
lru_add_drain_all();
- flush_scheduled_work();
cond_resched();
drain_all_pages();
}
@@ -862,7 +861,6 @@ repeat:
}
/* drain all zone's lru pagevec, this is asyncronous... */
lru_add_drain_all();
- flush_scheduled_work();
yield();
/* drain pcp pages , this is synchrouns. */
drain_all_pages();
diff --git a/mm/mlock.c b/mm/mlock.c
index 3f82720..b70919c 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -135,6 +135,13 @@ void munlock_vma_page(struct page *page)
}
}
+static inline int stack_guard_page(struct vm_area_struct *vma, unsigned long addr)
+{
+ return (vma->vm_flags & VM_GROWSDOWN) &&
+ (vma->vm_start == addr) &&
+ !vma_stack_continue(vma->vm_prev, addr);
+}
+
/**
* __mlock_vma_pages_range() - mlock a range of pages in the vma.
* @vma: target vma
@@ -167,6 +174,12 @@ static long __mlock_vma_pages_range(struct vm_area_struct *vma,
if (vma->vm_flags & VM_WRITE)
gup_flags |= FOLL_WRITE;
+ /* We don't try to access the guard page of a stack vma */
+ if (stack_guard_page(vma, start)) {
+ addr += PAGE_SIZE;
+ nr_pages--;
+ }
+
while (nr_pages > 0) {
int i;
diff --git a/mm/mmap.c b/mm/mmap.c
index 3100333..00161a4 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -388,17 +388,23 @@ static inline void
__vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, struct rb_node *rb_parent)
{
+ struct vm_area_struct *next;
+
+ vma->vm_prev = prev;
if (prev) {
- vma->vm_next = prev->vm_next;
+ next = prev->vm_next;
prev->vm_next = vma;
} else {
mm->mmap = vma;
if (rb_parent)
- vma->vm_next = rb_entry(rb_parent,
+ next = rb_entry(rb_parent,
struct vm_area_struct, vm_rb);
else
- vma->vm_next = NULL;
+ next = NULL;
}
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
}
void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma,
@@ -483,7 +489,11 @@ static inline void
__vma_unlink(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev)
{
- prev->vm_next = vma->vm_next;
+ struct vm_area_struct *next = vma->vm_next;
+
+ prev->vm_next = next;
+ if (next)
+ next->vm_prev = prev;
rb_erase(&vma->vm_rb, &mm->mm_rb);
if (mm->mmap_cache == vma)
mm->mmap_cache = prev;
@@ -1706,9 +1716,6 @@ static int acct_stack_growth(struct vm_area_struct *vma, unsigned long size, uns
* PA-RISC uses this for its stack; IA64 for its Register Backing Store.
* vma is the last one with address > vma->vm_end. Have to extend vma.
*/
-#ifndef CONFIG_IA64
-static
-#endif
int expand_upwards(struct vm_area_struct *vma, unsigned long address)
{
int error;
@@ -1915,6 +1922,7 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr;
insertion_point = (prev ? &prev->vm_next : &mm->mmap);
+ vma->vm_prev = NULL;
do {
rb_erase(&vma->vm_rb, &mm->mm_rb);
mm->map_count--;
@@ -1922,6 +1930,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
vma = vma->vm_next;
} while (vma && vma->vm_start < end);
*insertion_point = vma;
+ if (vma)
+ vma->vm_prev = prev;
tail_vma->vm_next = NULL;
if (mm->unmap_area == arch_unmap_area)
addr = prev ? prev->vm_end : mm->mmap_base;
@@ -1999,6 +2009,7 @@ static int __split_vma(struct mm_struct * mm, struct vm_area_struct * vma,
removed_exe_file_vma(mm);
fput(new->vm_file);
}
+ unlink_anon_vmas(new);
out_free_mpol:
mpol_put(pol);
out_free_vma:
diff --git a/mm/mmzone.c b/mm/mmzone.c
index f5b7d17..e35bfb8 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -87,3 +87,24 @@ int memmap_valid_within(unsigned long pfn,
return 1;
}
#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
+
+#ifdef CONFIG_SMP
+/* Called when a more accurate view of NR_FREE_PAGES is needed */
+unsigned long zone_nr_free_pages(struct zone *zone)
+{
+ unsigned long nr_free_pages = zone_page_state(zone, NR_FREE_PAGES);
+
+ /*
+ * While kswapd is awake, it is considered the zone is under some
+ * memory pressure. Under pressure, there is a risk that
+ * per-cpu-counter-drift will allow the min watermark to be breached
+ * potentially causing a live-lock. While kswapd is awake and
+ * free pages are low, get a better estimate for free pages
+ */
+ if (nr_free_pages < zone->percpu_drift_mark &&
+ !waitqueue_active(&zone->zone_pgdat->kswapd_wait))
+ return zone_page_state_snapshot(zone, NR_FREE_PAGES);
+
+ return nr_free_pages;
+}
+#endif /* CONFIG_SMP */
diff --git a/mm/nommu.c b/mm/nommu.c
index b76f3ee..88ff091 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -36,11 +36,6 @@
#include <asm/mmu_context.h>
#include "internal.h"
-static inline __attribute__((format(printf, 1, 2)))
-void no_printk(const char *fmt, ...)
-{
-}
-
#if 0
#define kenter(FMT, ...) \
printk(KERN_DEBUG "==> %s("FMT")\n", __func__, ##__VA_ARGS__)
@@ -609,7 +604,7 @@ static void protect_vma(struct vm_area_struct *vma, unsigned long flags)
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
- struct vm_area_struct *pvma, **pp;
+ struct vm_area_struct *pvma, **pp, *next;
struct address_space *mapping;
struct rb_node **p, *parent;
@@ -669,8 +664,11 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
break;
}
- vma->vm_next = *pp;
+ next = *pp;
*pp = vma;
+ vma->vm_next = next;
+ if (next)
+ next->vm_prev = vma;
}
/*
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index d3def05..4029583 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -106,7 +106,7 @@ static void boost_dying_task_prio(struct task_struct *p,
* pointer. Return p, or any of its subthreads with a valid ->mm, with
* task_lock() held.
*/
-static struct task_struct *find_lock_task_mm(struct task_struct *p)
+struct task_struct *find_lock_task_mm(struct task_struct *p)
{
struct task_struct *t = p;
@@ -121,8 +121,8 @@ static struct task_struct *find_lock_task_mm(struct task_struct *p)
}
/* return true if the task is not adequate as candidate victim task. */
-static bool oom_unkillable_task(struct task_struct *p, struct mem_cgroup *mem,
- const nodemask_t *nodemask)
+static bool oom_unkillable_task(struct task_struct *p,
+ const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
if (is_global_init(p))
return true;
@@ -208,8 +208,13 @@ unsigned int oom_badness(struct task_struct *p, struct mem_cgroup *mem,
*/
points += p->signal->oom_score_adj;
- if (points < 0)
- return 0;
+ /*
+ * Never return 0 for an eligible task that may be killed since it's
+ * possible that no single user task uses more than 0.1% of memory and
+ * no single admin tasks uses more than 3.0%.
+ */
+ if (points <= 0)
+ return 1;
return (points < 1000) ? points : 1000;
}
@@ -339,26 +344,24 @@ static struct task_struct *select_bad_process(unsigned int *ppoints,
/**
* dump_tasks - dump current memory state of all system tasks
* @mem: current's memory controller, if constrained
+ * @nodemask: nodemask passed to page allocator for mempolicy ooms
*
- * Dumps the current memory state of all system tasks, excluding kernel threads.
+ * Dumps the current memory state of all eligible tasks. Tasks not in the same
+ * memcg, not in the same cpuset, or bound to a disjoint set of mempolicy nodes
+ * are not shown.
* State information includes task's pid, uid, tgid, vm size, rss, cpu, oom_adj
* value, oom_score_adj value, and name.
*
- * If the actual is non-NULL, only tasks that are a member of the mem_cgroup are
- * shown.
- *
* Call with tasklist_lock read-locked.
*/
-static void dump_tasks(const struct mem_cgroup *mem)
+static void dump_tasks(const struct mem_cgroup *mem, const nodemask_t *nodemask)
{
struct task_struct *p;
struct task_struct *task;
pr_info("[ pid ] uid tgid total_vm rss cpu oom_adj oom_score_adj name\n");
for_each_process(p) {
- if (p->flags & PF_KTHREAD)
- continue;
- if (mem && !task_in_mem_cgroup(p, mem))
+ if (oom_unkillable_task(p, mem, nodemask))
continue;
task = find_lock_task_mm(p);
@@ -372,7 +375,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
}
pr_info("[%5d] %5d %5d %8lu %8lu %3u %3d %5d %s\n",
- task->pid, __task_cred(task)->uid, task->tgid,
+ task->pid, task_uid(task), task->tgid,
task->mm->total_vm, get_mm_rss(task->mm),
task_cpu(task), task->signal->oom_adj,
task->signal->oom_score_adj, task->comm);
@@ -381,7 +384,7 @@ static void dump_tasks(const struct mem_cgroup *mem)
}
static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
- struct mem_cgroup *mem)
+ struct mem_cgroup *mem, const nodemask_t *nodemask)
{
task_lock(current);
pr_warning("%s invoked oom-killer: gfp_mask=0x%x, order=%d, "
@@ -394,17 +397,16 @@ static void dump_header(struct task_struct *p, gfp_t gfp_mask, int order,
mem_cgroup_print_oom_info(mem, p);
show_mem();
if (sysctl_oom_dump_tasks)
- dump_tasks(mem);
+ dump_tasks(mem, nodemask);
}
#define K(x) ((x) << (PAGE_SHIFT-10))
static int oom_kill_task(struct task_struct *p, struct mem_cgroup *mem)
{
p = find_lock_task_mm(p);
- if (!p) {
- task_unlock(p);
+ if (!p)
return 1;
- }
+
pr_err("Killed process %d (%s) total-vm:%lukB, anon-rss:%lukB, file-rss:%lukB\n",
task_pid_nr(p), p->comm, K(p->mm->total_vm),
K(get_mm_counter(p->mm, MM_ANONPAGES)),
@@ -437,7 +439,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
unsigned int victim_points = 0;
if (printk_ratelimit())
- dump_header(p, gfp_mask, order, mem);
+ dump_header(p, gfp_mask, order, mem, nodemask);
/*
* If the task is already exiting, don't alarm the sysadmin or kill
@@ -483,7 +485,7 @@ static int oom_kill_process(struct task_struct *p, gfp_t gfp_mask, int order,
* Determines whether the kernel must panic because of the panic_on_oom sysctl.
*/
static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
- int order)
+ int order, const nodemask_t *nodemask)
{
if (likely(!sysctl_panic_on_oom))
return;
@@ -497,7 +499,7 @@ static void check_panic_on_oom(enum oom_constraint constraint, gfp_t gfp_mask,
return;
}
read_lock(&tasklist_lock);
- dump_header(NULL, gfp_mask, order, NULL);
+ dump_header(NULL, gfp_mask, order, NULL, nodemask);
read_unlock(&tasklist_lock);
panic("Out of memory: %s panic_on_oom is enabled\n",
sysctl_panic_on_oom == 2 ? "compulsory" : "system-wide");
@@ -510,7 +512,7 @@ void mem_cgroup_out_of_memory(struct mem_cgroup *mem, gfp_t gfp_mask)
unsigned int points = 0;
struct task_struct *p;
- check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0);
+ check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, 0, NULL);
limit = mem_cgroup_get_limit(mem) >> PAGE_SHIFT;
read_lock(&tasklist_lock);
retry:
@@ -642,11 +644,13 @@ static void clear_system_oom(void)
void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
int order, nodemask_t *nodemask)
{
+ const nodemask_t *mpol_mask;
struct task_struct *p;
unsigned long totalpages;
unsigned long freed = 0;
unsigned int points;
enum oom_constraint constraint = CONSTRAINT_NONE;
+ int killed = 0;
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
if (freed > 0)
@@ -670,7 +674,8 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
*/
constraint = constrained_alloc(zonelist, gfp_mask, nodemask,
&totalpages);
- check_panic_on_oom(constraint, gfp_mask, order);
+ mpol_mask = (constraint == CONSTRAINT_MEMORY_POLICY) ? nodemask : NULL;
+ check_panic_on_oom(constraint, gfp_mask, order, mpol_mask);
read_lock(&tasklist_lock);
if (sysctl_oom_kill_allocating_task &&
@@ -684,19 +689,17 @@ void out_of_memory(struct zonelist *zonelist, gfp_t gfp_mask,
if (!oom_kill_process(current, gfp_mask, order, 0, totalpages,
NULL, nodemask,
"Out of memory (oom_kill_allocating_task)"))
- return;
+ goto out;
}
retry:
- p = select_bad_process(&points, totalpages, NULL,
- constraint == CONSTRAINT_MEMORY_POLICY ? nodemask :
- NULL);
+ p = select_bad_process(&points, totalpages, NULL, mpol_mask);
if (PTR_ERR(p) == -1UL)
- return;
+ goto out;
/* Found nothing?!?! Either we hang forever, or we panic. */
if (!p) {
- dump_header(NULL, gfp_mask, order, NULL);
+ dump_header(NULL, gfp_mask, order, NULL, mpol_mask);
read_unlock(&tasklist_lock);
panic("Out of memory and no killable processes...\n");
}
@@ -704,13 +707,15 @@ retry:
if (oom_kill_process(p, gfp_mask, order, points, totalpages, NULL,
nodemask, "Out of memory"))
goto retry;
+ killed = 1;
+out:
read_unlock(&tasklist_lock);
/*
* Give "p" a good chance of killing itself before we
* retry to allocate memory unless "p" is current
*/
- if (!test_thread_flag(TIF_MEMDIE))
+ if (killed && !test_thread_flag(TIF_MEMDIE))
schedule_timeout_uninterruptible(1);
}
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 0c6258b..e3bccac 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -253,32 +253,6 @@ static void bdi_writeout_fraction(struct backing_dev_info *bdi,
}
}
-/*
- * Clip the earned share of dirty pages to that which is actually available.
- * This avoids exceeding the total dirty_limit when the floating averages
- * fluctuate too quickly.
- */
-static void clip_bdi_dirty_limit(struct backing_dev_info *bdi,
- unsigned long dirty, unsigned long *pbdi_dirty)
-{
- unsigned long avail_dirty;
-
- avail_dirty = global_page_state(NR_FILE_DIRTY) +
- global_page_state(NR_WRITEBACK) +
- global_page_state(NR_UNSTABLE_NFS) +
- global_page_state(NR_WRITEBACK_TEMP);
-
- if (avail_dirty < dirty)
- avail_dirty = dirty - avail_dirty;
- else
- avail_dirty = 0;
-
- avail_dirty += bdi_stat(bdi, BDI_RECLAIMABLE) +
- bdi_stat(bdi, BDI_WRITEBACK);
-
- *pbdi_dirty = min(*pbdi_dirty, avail_dirty);
-}
-
static inline void task_dirties_fraction(struct task_struct *tsk,
long *numerator, long *denominator)
{
@@ -287,16 +261,24 @@ static inline void task_dirties_fraction(struct task_struct *tsk,
}
/*
- * scale the dirty limit
+ * task_dirty_limit - scale down dirty throttling threshold for one task
*
* task specific dirty limit:
*
* dirty -= (dirty/8) * p_{t}
+ *
+ * To protect light/slow dirtying tasks from heavier/fast ones, we start
+ * throttling individual tasks before reaching the bdi dirty limit.
+ * Relatively low thresholds will be allocated to heavy dirtiers. So when
+ * dirty pages grow large, heavy dirtiers will be throttled first, which will
+ * effectively curb the growth of dirty pages. Light dirtiers with high enough
+ * dirty threshold may never get throttled.
*/
-static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
+static unsigned long task_dirty_limit(struct task_struct *tsk,
+ unsigned long bdi_dirty)
{
long numerator, denominator;
- unsigned long dirty = *pdirty;
+ unsigned long dirty = bdi_dirty;
u64 inv = dirty >> 3;
task_dirties_fraction(tsk, &numerator, &denominator);
@@ -304,10 +286,8 @@ static void task_dirty_limit(struct task_struct *tsk, unsigned long *pdirty)
do_div(inv, denominator);
dirty -= inv;
- if (dirty < *pdirty/2)
- dirty = *pdirty/2;
- *pdirty = dirty;
+ return max(dirty, bdi_dirty/2);
}
/*
@@ -417,9 +397,16 @@ unsigned long determine_dirtyable_memory(void)
return x + 1; /* Ensure that we never return 0 */
}
-void
-get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
- unsigned long *pbdi_dirty, struct backing_dev_info *bdi)
+/*
+ * global_dirty_limits - background-writeback and dirty-throttling thresholds
+ *
+ * Calculate the dirty thresholds based on sysctl parameters
+ * - vm.dirty_background_ratio or vm.dirty_background_bytes
+ * - vm.dirty_ratio or vm.dirty_bytes
+ * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
+ * runtime tasks.
+ */
+void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty)
{
unsigned long background;
unsigned long dirty;
@@ -451,27 +438,37 @@ get_dirty_limits(unsigned long *pbackground, unsigned long *pdirty,
}
*pbackground = background;
*pdirty = dirty;
+}
- if (bdi) {
- u64 bdi_dirty;
- long numerator, denominator;
+/*
+ * bdi_dirty_limit - @bdi's share of dirty throttling threshold
+ *
+ * Allocate high/low dirty limits to fast/slow devices, in order to prevent
+ * - starving fast devices
+ * - piling up dirty pages (that will take long time to sync) on slow devices
+ *
+ * The bdi's share of dirty limit will be adapting to its throughput and
+ * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
+ */
+unsigned long bdi_dirty_limit(struct backing_dev_info *bdi, unsigned long dirty)
+{
+ u64 bdi_dirty;
+ long numerator, denominator;
- /*
- * Calculate this BDI's share of the dirty ratio.
- */
- bdi_writeout_fraction(bdi, &numerator, &denominator);
-
- bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
- bdi_dirty *= numerator;
- do_div(bdi_dirty, denominator);
- bdi_dirty += (dirty * bdi->min_ratio) / 100;
- if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
- bdi_dirty = dirty * bdi->max_ratio / 100;
-
- *pbdi_dirty = bdi_dirty;
- clip_bdi_dirty_limit(bdi, dirty, pbdi_dirty);
- task_dirty_limit(current, pbdi_dirty);
- }
+ /*
+ * Calculate this BDI's share of the dirty ratio.
+ */
+ bdi_writeout_fraction(bdi, &numerator, &denominator);
+
+ bdi_dirty = (dirty * (100 - bdi_min_ratio)) / 100;
+ bdi_dirty *= numerator;
+ do_div(bdi_dirty, denominator);
+
+ bdi_dirty += (dirty * bdi->min_ratio) / 100;
+ if (bdi_dirty > (dirty * bdi->max_ratio) / 100)
+ bdi_dirty = dirty * bdi->max_ratio / 100;
+
+ return bdi_dirty;
}
/*
@@ -491,7 +488,7 @@ static void balance_dirty_pages(struct address_space *mapping,
unsigned long bdi_thresh;
unsigned long pages_written = 0;
unsigned long pause = 1;
-
+ bool dirty_exceeded = false;
struct backing_dev_info *bdi = mapping->backing_dev_info;
for (;;) {
@@ -502,18 +499,11 @@ static void balance_dirty_pages(struct address_space *mapping,
.range_cyclic = 1,
};
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
-
nr_reclaimable = global_page_state(NR_FILE_DIRTY) +
global_page_state(NR_UNSTABLE_NFS);
nr_writeback = global_page_state(NR_WRITEBACK);
- bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
- bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
-
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
- break;
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Throttle it only when the background writeback cannot
@@ -524,26 +514,8 @@ static void balance_dirty_pages(struct address_space *mapping,
(background_thresh + dirty_thresh) / 2)
break;
- if (!bdi->dirty_exceeded)
- bdi->dirty_exceeded = 1;
-
- /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- * Only move pages to writeback if this bdi is over its
- * threshold otherwise wait until the disk writes catch
- * up.
- */
- trace_wbc_balance_dirty_start(&wbc, bdi);
- if (bdi_nr_reclaimable > bdi_thresh) {
- writeback_inodes_wb(&bdi->wb, &wbc);
- pages_written += write_chunk - wbc.nr_to_write;
- get_dirty_limits(&background_thresh, &dirty_thresh,
- &bdi_thresh, bdi);
- trace_wbc_balance_dirty_written(&wbc, bdi);
- }
+ bdi_thresh = bdi_dirty_limit(bdi, dirty_thresh);
+ bdi_thresh = task_dirty_limit(current, bdi_thresh);
/*
* In order to avoid the stacked BDI deadlock we need
@@ -558,16 +530,44 @@ static void balance_dirty_pages(struct address_space *mapping,
if (bdi_thresh < 2*bdi_stat_error(bdi)) {
bdi_nr_reclaimable = bdi_stat_sum(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat_sum(bdi, BDI_WRITEBACK);
- } else if (bdi_nr_reclaimable) {
+ } else {
bdi_nr_reclaimable = bdi_stat(bdi, BDI_RECLAIMABLE);
bdi_nr_writeback = bdi_stat(bdi, BDI_WRITEBACK);
}
- if (bdi_nr_reclaimable + bdi_nr_writeback <= bdi_thresh)
+ /*
+ * The bdi thresh is somehow "soft" limit derived from the
+ * global "hard" limit. The former helps to prevent heavy IO
+ * bdi or process from holding back light ones; The latter is
+ * the last resort safeguard.
+ */
+ dirty_exceeded =
+ (bdi_nr_reclaimable + bdi_nr_writeback >= bdi_thresh)
+ || (nr_reclaimable + nr_writeback >= dirty_thresh);
+
+ if (!dirty_exceeded)
break;
- if (pages_written >= write_chunk)
- break; /* We've done our duty */
+ if (!bdi->dirty_exceeded)
+ bdi->dirty_exceeded = 1;
+
+ /* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
+ * Unstable writes are a feature of certain networked
+ * filesystems (i.e. NFS) in which data may have been
+ * written to the server's write cache, but has not yet
+ * been flushed to permanent storage.
+ * Only move pages to writeback if this bdi is over its
+ * threshold otherwise wait until the disk writes catch
+ * up.
+ */
+ trace_wbc_balance_dirty_start(&wbc, bdi);
+ if (bdi_nr_reclaimable > bdi_thresh) {
+ writeback_inodes_wb(&bdi->wb, &wbc);
+ pages_written += write_chunk - wbc.nr_to_write;
+ trace_wbc_balance_dirty_written(&wbc, bdi);
+ if (pages_written >= write_chunk)
+ break; /* We've done our duty */
+ }
trace_wbc_balance_dirty_wait(&wbc, bdi);
__set_current_state(TASK_INTERRUPTIBLE);
io_schedule_timeout(pause);
@@ -581,8 +581,7 @@ static void balance_dirty_pages(struct address_space *mapping,
pause = HZ / 10;
}
- if (bdi_nr_reclaimable + bdi_nr_writeback < bdi_thresh &&
- bdi->dirty_exceeded)
+ if (!dirty_exceeded && bdi->dirty_exceeded)
bdi->dirty_exceeded = 0;
if (writeback_in_progress(bdi))
@@ -597,9 +596,7 @@ static void balance_dirty_pages(struct address_space *mapping,
* background_thresh, to keep the amount of dirty memory low.
*/
if ((laptop_mode && pages_written) ||
- (!laptop_mode && ((global_page_state(NR_FILE_DIRTY)
- + global_page_state(NR_UNSTABLE_NFS))
- > background_thresh)))
+ (!laptop_mode && (nr_reclaimable > background_thresh)))
bdi_start_background_writeback(bdi);
}
@@ -663,7 +660,7 @@ void throttle_vm_writeout(gfp_t gfp_mask)
unsigned long dirty_thresh;
for ( ; ; ) {
- get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
+ global_dirty_limits(&background_thresh, &dirty_thresh);
/*
* Boost the allowable dirty threshold a bit for page
@@ -825,10 +822,10 @@ void __init page_writeback_init(void)
/*
* We tag pages in batches of WRITEBACK_TAG_BATCH to reduce tree_lock latency.
*/
-#define WRITEBACK_TAG_BATCH 4096
void tag_pages_for_writeback(struct address_space *mapping,
pgoff_t start, pgoff_t end)
{
+#define WRITEBACK_TAG_BATCH 4096
unsigned long tagged;
do {
@@ -839,7 +836,8 @@ void tag_pages_for_writeback(struct address_space *mapping,
spin_unlock_irq(&mapping->tree_lock);
WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH);
cond_resched();
- } while (tagged >= WRITEBACK_TAG_BATCH);
+ /* We check 'start' to handle wrapping when end == ~0UL */
+ } while (tagged >= WRITEBACK_TAG_BATCH && start);
}
EXPORT_SYMBOL(tag_pages_for_writeback);
@@ -987,22 +985,16 @@ continue_unlock:
}
}
- if (wbc->nr_to_write > 0) {
- if (--wbc->nr_to_write == 0 &&
- wbc->sync_mode == WB_SYNC_NONE) {
- /*
- * We stop writing back only if we are
- * not doing integrity sync. In case of
- * integrity sync we have to keep going
- * because someone may be concurrently
- * dirtying pages, and we might have
- * synced a lot of newly appeared dirty
- * pages, but have not synced all of the
- * old dirty pages.
- */
- done = 1;
- break;
- }
+ /*
+ * We stop writing back only if we are not doing
+ * integrity sync. In case of integrity sync we have to
+ * keep going until we have written all the pages
+ * we tagged for writeback prior to entering this loop.
+ */
+ if (--wbc->nr_to_write <= 0 &&
+ wbc->sync_mode == WB_SYNC_NONE) {
+ done = 1;
+ break;
}
}
pagevec_release(&pvec);
@@ -1134,6 +1126,7 @@ void account_page_dirtied(struct page *page, struct address_space *mapping)
task_io_account_write(PAGE_CACHE_SIZE);
}
}
+EXPORT_SYMBOL(account_page_dirtied);
/*
* For address_spaces which do not use buffers. Just tag the page as dirty in
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a9649f4..2a362c5 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -21,6 +21,7 @@
#include <linux/pagemap.h>
#include <linux/jiffies.h>
#include <linux/bootmem.h>
+#include <linux/memblock.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <linux/kmemcheck.h>
@@ -588,13 +589,13 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
+ int to_free = count;
spin_lock(&zone->lock);
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, count);
- while (count) {
+ while (to_free) {
struct page *page;
struct list_head *list;
@@ -619,8 +620,9 @@ static void free_pcppages_bulk(struct zone *zone, int count,
/* MIGRATE_MOVABLE list may include MIGRATE_RESERVEs */
__free_one_page(page, zone, 0, page_private(page));
trace_mm_page_pcpu_drain(page, 0, page_private(page));
- } while (--count && --batch_free && !list_empty(list));
+ } while (--to_free && --batch_free && !list_empty(list));
}
+ __mod_zone_page_state(zone, NR_FREE_PAGES, count);
spin_unlock(&zone->lock);
}
@@ -631,8 +633,8 @@ static void free_one_page(struct zone *zone, struct page *page, int order,
zone->all_unreclaimable = 0;
zone->pages_scanned = 0;
- __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
__free_one_page(page, zone, order, migratetype);
+ __mod_zone_page_state(zone, NR_FREE_PAGES, 1 << order);
spin_unlock(&zone->lock);
}
@@ -1461,7 +1463,7 @@ int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
{
/* free_pages my go negative - that's OK */
long min = mark;
- long free_pages = zone_page_state(z, NR_FREE_PAGES) - (1 << order) + 1;
+ long free_pages = zone_nr_free_pages(z) - (1 << order) + 1;
int o;
if (alloc_flags & ALLOC_HIGH)
@@ -1846,6 +1848,7 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
struct page *page = NULL;
struct reclaim_state reclaim_state;
struct task_struct *p = current;
+ bool drained = false;
cond_resched();
@@ -1864,14 +1867,25 @@ __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
cond_resched();
- if (order != 0)
- drain_all_pages();
+ if (unlikely(!(*did_some_progress)))
+ return NULL;
- if (likely(*did_some_progress))
- page = get_page_from_freelist(gfp_mask, nodemask, order,
+retry:
+ page = get_page_from_freelist(gfp_mask, nodemask, order,
zonelist, high_zoneidx,
alloc_flags, preferred_zone,
migratetype);
+
+ /*
+ * If an allocation failed after direct reclaim, it could be because
+ * pages are pinned on the per-cpu lists. Drain them and try again
+ */
+ if (!page && !drained) {
+ drain_all_pages();
+ drained = true;
+ goto retry;
+ }
+
return page;
}
@@ -2423,7 +2437,7 @@ void show_free_areas(void)
" all_unreclaimable? %s"
"\n",
zone->name,
- K(zone_page_state(zone, NR_FREE_PAGES)),
+ K(zone_nr_free_pages(zone)),
K(min_wmark_pages(zone)),
K(low_wmark_pages(zone)),
K(high_wmark_pages(zone)),
@@ -3623,6 +3637,41 @@ void __init free_bootmem_with_active_regions(int nid,
}
}
+#ifdef CONFIG_HAVE_MEMBLOCK
+u64 __init find_memory_core_early(int nid, u64 size, u64 align,
+ u64 goal, u64 limit)
+{
+ int i;
+
+ /* Need to go over early_node_map to find out good range for node */
+ for_each_active_range_index_in_nid(i, nid) {
+ u64 addr;
+ u64 ei_start, ei_last;
+ u64 final_start, final_end;
+
+ ei_last = early_node_map[i].end_pfn;
+ ei_last <<= PAGE_SHIFT;
+ ei_start = early_node_map[i].start_pfn;
+ ei_start <<= PAGE_SHIFT;
+
+ final_start = max(ei_start, goal);
+ final_end = min(ei_last, limit);
+
+ if (final_start >= final_end)
+ continue;
+
+ addr = memblock_find_in_range(final_start, final_end, size, align);
+
+ if (addr == MEMBLOCK_ERROR)
+ continue;
+
+ return addr;
+ }
+
+ return MEMBLOCK_ERROR;
+}
+#endif
+
int __init add_from_early_node_map(struct range *range, int az,
int nr_range, int nid)
{
@@ -3642,46 +3691,26 @@ int __init add_from_early_node_map(struct range *range, int az,
void * __init __alloc_memory_core_early(int nid, u64 size, u64 align,
u64 goal, u64 limit)
{
- int i;
void *ptr;
+ u64 addr;
- if (limit > get_max_mapped())
- limit = get_max_mapped();
+ if (limit > memblock.current_limit)
+ limit = memblock.current_limit;
- /* need to go over early_node_map to find out good range for node */
- for_each_active_range_index_in_nid(i, nid) {
- u64 addr;
- u64 ei_start, ei_last;
+ addr = find_memory_core_early(nid, size, align, goal, limit);
- ei_last = early_node_map[i].end_pfn;
- ei_last <<= PAGE_SHIFT;
- ei_start = early_node_map[i].start_pfn;
- ei_start <<= PAGE_SHIFT;
- addr = find_early_area(ei_start, ei_last,
- goal, limit, size, align);
-
- if (addr == -1ULL)
- continue;
-
-#if 0
- printk(KERN_DEBUG "alloc (nid=%d %llx - %llx) (%llx - %llx) %llx %llx => %llx\n",
- nid,
- ei_start, ei_last, goal, limit, size,
- align, addr);
-#endif
-
- ptr = phys_to_virt(addr);
- memset(ptr, 0, size);
- reserve_early_without_check(addr, addr + size, "BOOTMEM");
- /*
- * The min_count is set to 0 so that bootmem allocated blocks
- * are never reported as leaks.
- */
- kmemleak_alloc(ptr, size, 0, 0);
- return ptr;
- }
+ if (addr == MEMBLOCK_ERROR)
+ return NULL;
- return NULL;
+ ptr = phys_to_virt(addr);
+ memset(ptr, 0, size);
+ memblock_x86_reserve_range(addr, addr + size, "BOOTMEM");
+ /*
+ * The min_count is set to 0 so that bootmem allocated blocks
+ * are never reported as leaks.
+ */
+ kmemleak_alloc(ptr, size, 0, 0);
+ return ptr;
}
#endif
@@ -5169,9 +5198,9 @@ void *__init alloc_large_system_hash(const char *tablename,
if (!table)
panic("Failed to allocate %s hash table\n", tablename);
- printk(KERN_INFO "%s hash table entries: %d (order: %d, %lu bytes)\n",
+ printk(KERN_INFO "%s hash table entries: %ld (order: %d, %lu bytes)\n",
tablename,
- (1U << log2qty),
+ (1UL << log2qty),
ilog2(size) - PAGE_SHIFT,
size);
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index df68085..89633fe 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -27,7 +27,7 @@
* chunk size is not aligned. percpu-km code will whine about it.
*/
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+#if defined(CONFIG_SMP) && defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
#error "contiguous percpu allocation is incompatible with paged first chunk"
#endif
@@ -35,7 +35,11 @@
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
- /* noop */
+ unsigned int cpu;
+
+ for_each_possible_cpu(cpu)
+ memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
+
return 0;
}
diff --git a/mm/percpu.c b/mm/percpu.c
index 039f51a..efe8168 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,7 @@
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
+#ifdef CONFIG_SMP
/* default addr <-> pcpu_ptr mapping, override in asm/percpu.h if necessary */
#ifndef __addr_to_pcpu_ptr
#define __addr_to_pcpu_ptr(addr) \
@@ -89,6 +90,11 @@
(unsigned long)pcpu_base_addr - \
(unsigned long)__per_cpu_start)
#endif
+#else /* CONFIG_SMP */
+/* on UP, it's always identity mapped */
+#define __addr_to_pcpu_ptr(addr) (void __percpu *)(addr)
+#define __pcpu_ptr_to_addr(ptr) (void __force *)(ptr)
+#endif /* CONFIG_SMP */
struct pcpu_chunk {
struct list_head list; /* linked to pcpu_slot lists */
@@ -393,7 +399,9 @@ static int pcpu_extend_area_map(struct pcpu_chunk *chunk, int new_alloc)
goto out_unlock;
old_size = chunk->map_alloc * sizeof(chunk->map[0]);
- memcpy(new, chunk->map, old_size);
+ old = chunk->map;
+
+ memcpy(new, old, old_size);
chunk->map_alloc = new_alloc;
chunk->map = new;
@@ -818,8 +826,8 @@ fail_unlock_mutex:
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
- * Allocate percpu area of @size bytes aligned at @align. Might
- * sleep. Might trigger writeouts.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align.
+ * Might sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
@@ -838,9 +846,10 @@ EXPORT_SYMBOL_GPL(__alloc_percpu);
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
- * Allocate percpu area of @size bytes aligned at @align from reserved
- * percpu area if arch has set it up; otherwise, allocation is served
- * from the same dynamic area. Might sleep. Might trigger writeouts.
+ * Allocate zero-filled percpu area of @size bytes aligned at @align
+ * from reserved percpu area if arch has set it up; otherwise,
+ * allocation is served from the same dynamic area. Might sleep.
+ * Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
@@ -947,6 +956,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
*/
bool is_kernel_percpu_address(unsigned long addr)
{
+#ifdef CONFIG_SMP
const size_t static_size = __per_cpu_end - __per_cpu_start;
void __percpu *base = __addr_to_pcpu_ptr(pcpu_base_addr);
unsigned int cpu;
@@ -957,6 +967,8 @@ bool is_kernel_percpu_address(unsigned long addr)
if ((void *)addr >= start && (void *)addr < start + static_size)
return true;
}
+#endif
+ /* on UP, can't distinguish from other static vars, always false */
return false;
}
@@ -1065,161 +1077,6 @@ void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
}
/**
- * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
- * @reserved_size: the size of reserved percpu area in bytes
- * @dyn_size: minimum free size for dynamic allocation in bytes
- * @atom_size: allocation atom size
- * @cpu_distance_fn: callback to determine distance between cpus, optional
- *
- * This function determines grouping of units, their mappings to cpus
- * and other parameters considering needed percpu size, allocation
- * atom size and distances between CPUs.
- *
- * Groups are always mutliples of atom size and CPUs which are of
- * LOCAL_DISTANCE both ways are grouped together and share space for
- * units in the same group. The returned configuration is guaranteed
- * to have CPUs on different nodes on different groups and >=75% usage
- * of allocated virtual address space.
- *
- * RETURNS:
- * On success, pointer to the new allocation_info is returned. On
- * failure, ERR_PTR value is returned.
- */
-static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
- size_t reserved_size, size_t dyn_size,
- size_t atom_size,
- pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
-{
- static int group_map[NR_CPUS] __initdata;
- static int group_cnt[NR_CPUS] __initdata;
- const size_t static_size = __per_cpu_end - __per_cpu_start;
- int nr_groups = 1, nr_units = 0;
- size_t size_sum, min_unit_size, alloc_size;
- int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
- int last_allocs, group, unit;
- unsigned int cpu, tcpu;
- struct pcpu_alloc_info *ai;
- unsigned int *cpu_map;
-
- /* this function may be called multiple times */
- memset(group_map, 0, sizeof(group_map));
- memset(group_cnt, 0, sizeof(group_cnt));
-
- /* calculate size_sum and ensure dyn_size is enough for early alloc */
- size_sum = PFN_ALIGN(static_size + reserved_size +
- max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
- dyn_size = size_sum - static_size - reserved_size;
-
- /*
- * Determine min_unit_size, alloc_size and max_upa such that
- * alloc_size is multiple of atom_size and is the smallest
- * which can accomodate 4k aligned segments which are equal to
- * or larger than min_unit_size.
- */
- min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
-
- alloc_size = roundup(min_unit_size, atom_size);
- upa = alloc_size / min_unit_size;
- while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
- upa--;
- max_upa = upa;
-
- /* group cpus according to their proximity */
- for_each_possible_cpu(cpu) {
- group = 0;
- next_group:
- for_each_possible_cpu(tcpu) {
- if (cpu == tcpu)
- break;
- if (group_map[tcpu] == group && cpu_distance_fn &&
- (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
- cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
- group++;
- nr_groups = max(nr_groups, group + 1);
- goto next_group;
- }
- }
- group_map[cpu] = group;
- group_cnt[group]++;
- }
-
- /*
- * Expand unit size until address space usage goes over 75%
- * and then as much as possible without using more address
- * space.
- */
- last_allocs = INT_MAX;
- for (upa = max_upa; upa; upa--) {
- int allocs = 0, wasted = 0;
-
- if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
- continue;
-
- for (group = 0; group < nr_groups; group++) {
- int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
- allocs += this_allocs;
- wasted += this_allocs * upa - group_cnt[group];
- }
-
- /*
- * Don't accept if wastage is over 25%. The
- * greater-than comparison ensures upa==1 always
- * passes the following check.
- */
- if (wasted > num_possible_cpus() / 3)
- continue;
-
- /* and then don't consume more memory */
- if (allocs > last_allocs)
- break;
- last_allocs = allocs;
- best_upa = upa;
- }
- upa = best_upa;
-
- /* allocate and fill alloc_info */
- for (group = 0; group < nr_groups; group++)
- nr_units += roundup(group_cnt[group], upa);
-
- ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
- if (!ai)
- return ERR_PTR(-ENOMEM);
- cpu_map = ai->groups[0].cpu_map;
-
- for (group = 0; group < nr_groups; group++) {
- ai->groups[group].cpu_map = cpu_map;
- cpu_map += roundup(group_cnt[group], upa);
- }
-
- ai->static_size = static_size;
- ai->reserved_size = reserved_size;
- ai->dyn_size = dyn_size;
- ai->unit_size = alloc_size / upa;
- ai->atom_size = atom_size;
- ai->alloc_size = alloc_size;
-
- for (group = 0, unit = 0; group_cnt[group]; group++) {
- struct pcpu_group_info *gi = &ai->groups[group];
-
- /*
- * Initialize base_offset as if all groups are located
- * back-to-back. The caller should update this to
- * reflect actual allocation.
- */
- gi->base_offset = unit * ai->unit_size;
-
- for_each_possible_cpu(cpu)
- if (group_map[cpu] == group)
- gi->cpu_map[gi->nr_units++] = cpu;
- gi->nr_units = roundup(gi->nr_units, upa);
- unit += gi->nr_units;
- }
- BUG_ON(unit != nr_units);
-
- return ai;
-}
-
-/**
* pcpu_dump_alloc_info - print out information about pcpu_alloc_info
* @lvl: loglevel
* @ai: allocation info to dump
@@ -1361,7 +1218,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
/* sanity checks */
PCPU_SETUP_BUG_ON(ai->nr_groups <= 0);
+#ifdef CONFIG_SMP
PCPU_SETUP_BUG_ON(!ai->static_size);
+#endif
PCPU_SETUP_BUG_ON(!base_addr);
PCPU_SETUP_BUG_ON(ai->unit_size < size_sum);
PCPU_SETUP_BUG_ON(ai->unit_size & ~PAGE_MASK);
@@ -1399,9 +1258,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
if (pcpu_first_unit_cpu == NR_CPUS)
pcpu_first_unit_cpu = cpu;
+ pcpu_last_unit_cpu = cpu;
}
}
- pcpu_last_unit_cpu = cpu;
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
@@ -1486,6 +1345,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
return 0;
}
+#ifdef CONFIG_SMP
+
const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
@@ -1513,8 +1374,180 @@ static int __init percpu_alloc_setup(char *str)
}
early_param("percpu_alloc", percpu_alloc_setup);
+/*
+ * pcpu_embed_first_chunk() is used by the generic percpu setup.
+ * Build it if needed by the arch config or the generic setup is going
+ * to be used.
+ */
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
+#define BUILD_EMBED_FIRST_CHUNK
+#endif
+
+/* build pcpu_page_first_chunk() iff needed by the arch config */
+#if defined(CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK)
+#define BUILD_PAGE_FIRST_CHUNK
+#endif
+
+/* pcpu_build_alloc_info() is used by both embed and page first chunk */
+#if defined(BUILD_EMBED_FIRST_CHUNK) || defined(BUILD_PAGE_FIRST_CHUNK)
+/**
+ * pcpu_build_alloc_info - build alloc_info considering distances between CPUs
+ * @reserved_size: the size of reserved percpu area in bytes
+ * @dyn_size: minimum free size for dynamic allocation in bytes
+ * @atom_size: allocation atom size
+ * @cpu_distance_fn: callback to determine distance between cpus, optional
+ *
+ * This function determines grouping of units, their mappings to cpus
+ * and other parameters considering needed percpu size, allocation
+ * atom size and distances between CPUs.
+ *
+ * Groups are always mutliples of atom size and CPUs which are of
+ * LOCAL_DISTANCE both ways are grouped together and share space for
+ * units in the same group. The returned configuration is guaranteed
+ * to have CPUs on different nodes on different groups and >=75% usage
+ * of allocated virtual address space.
+ *
+ * RETURNS:
+ * On success, pointer to the new allocation_info is returned. On
+ * failure, ERR_PTR value is returned.
+ */
+static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
+ size_t reserved_size, size_t dyn_size,
+ size_t atom_size,
+ pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
+{
+ static int group_map[NR_CPUS] __initdata;
+ static int group_cnt[NR_CPUS] __initdata;
+ const size_t static_size = __per_cpu_end - __per_cpu_start;
+ int nr_groups = 1, nr_units = 0;
+ size_t size_sum, min_unit_size, alloc_size;
+ int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int last_allocs, group, unit;
+ unsigned int cpu, tcpu;
+ struct pcpu_alloc_info *ai;
+ unsigned int *cpu_map;
+
+ /* this function may be called multiple times */
+ memset(group_map, 0, sizeof(group_map));
+ memset(group_cnt, 0, sizeof(group_cnt));
+
+ /* calculate size_sum and ensure dyn_size is enough for early alloc */
+ size_sum = PFN_ALIGN(static_size + reserved_size +
+ max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
+ dyn_size = size_sum - static_size - reserved_size;
+
+ /*
+ * Determine min_unit_size, alloc_size and max_upa such that
+ * alloc_size is multiple of atom_size and is the smallest
+ * which can accomodate 4k aligned segments which are equal to
+ * or larger than min_unit_size.
+ */
+ min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
+
+ alloc_size = roundup(min_unit_size, atom_size);
+ upa = alloc_size / min_unit_size;
+ while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ upa--;
+ max_upa = upa;
+
+ /* group cpus according to their proximity */
+ for_each_possible_cpu(cpu) {
+ group = 0;
+ next_group:
+ for_each_possible_cpu(tcpu) {
+ if (cpu == tcpu)
+ break;
+ if (group_map[tcpu] == group && cpu_distance_fn &&
+ (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
+ cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
+ group++;
+ nr_groups = max(nr_groups, group + 1);
+ goto next_group;
+ }
+ }
+ group_map[cpu] = group;
+ group_cnt[group]++;
+ }
+
+ /*
+ * Expand unit size until address space usage goes over 75%
+ * and then as much as possible without using more address
+ * space.
+ */
+ last_allocs = INT_MAX;
+ for (upa = max_upa; upa; upa--) {
+ int allocs = 0, wasted = 0;
+
+ if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
+ continue;
+
+ for (group = 0; group < nr_groups; group++) {
+ int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
+ allocs += this_allocs;
+ wasted += this_allocs * upa - group_cnt[group];
+ }
+
+ /*
+ * Don't accept if wastage is over 1/3. The
+ * greater-than comparison ensures upa==1 always
+ * passes the following check.
+ */
+ if (wasted > num_possible_cpus() / 3)
+ continue;
+
+ /* and then don't consume more memory */
+ if (allocs > last_allocs)
+ break;
+ last_allocs = allocs;
+ best_upa = upa;
+ }
+ upa = best_upa;
+
+ /* allocate and fill alloc_info */
+ for (group = 0; group < nr_groups; group++)
+ nr_units += roundup(group_cnt[group], upa);
+
+ ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
+ if (!ai)
+ return ERR_PTR(-ENOMEM);
+ cpu_map = ai->groups[0].cpu_map;
+
+ for (group = 0; group < nr_groups; group++) {
+ ai->groups[group].cpu_map = cpu_map;
+ cpu_map += roundup(group_cnt[group], upa);
+ }
+
+ ai->static_size = static_size;
+ ai->reserved_size = reserved_size;
+ ai->dyn_size = dyn_size;
+ ai->unit_size = alloc_size / upa;
+ ai->atom_size = atom_size;
+ ai->alloc_size = alloc_size;
+
+ for (group = 0, unit = 0; group_cnt[group]; group++) {
+ struct pcpu_group_info *gi = &ai->groups[group];
+
+ /*
+ * Initialize base_offset as if all groups are located
+ * back-to-back. The caller should update this to
+ * reflect actual allocation.
+ */
+ gi->base_offset = unit * ai->unit_size;
+
+ for_each_possible_cpu(cpu)
+ if (group_map[cpu] == group)
+ gi->cpu_map[gi->nr_units++] = cpu;
+ gi->nr_units = roundup(gi->nr_units, upa);
+ unit += gi->nr_units;
+ }
+ BUG_ON(unit != nr_units);
+
+ return ai;
+}
+#endif /* BUILD_EMBED_FIRST_CHUNK || BUILD_PAGE_FIRST_CHUNK */
+
+#if defined(BUILD_EMBED_FIRST_CHUNK)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @reserved_size: the size of reserved percpu area in bytes
@@ -1643,10 +1676,9 @@ out_free:
free_bootmem(__pa(areas), areas_size);
return rc;
}
-#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
- !CONFIG_HAVE_SETUP_PER_CPU_AREA */
+#endif /* BUILD_EMBED_FIRST_CHUNK */
-#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
+#ifdef BUILD_PAGE_FIRST_CHUNK
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
@@ -1754,10 +1786,11 @@ out_free_ar:
pcpu_free_alloc_info(ai);
return rc;
}
-#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
+#endif /* BUILD_PAGE_FIRST_CHUNK */
+#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
- * Generic percpu area setup.
+ * Generic SMP percpu area setup.
*
* The embedding helper is used because its behavior closely resembles
* the original non-dynamic generic percpu area setup. This is
@@ -1768,7 +1801,6 @@ out_free_ar:
* on the physical linear memory mapping which uses large page
* mappings on applicable archs.
*/
-#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
@@ -1797,13 +1829,48 @@ void __init setup_per_cpu_areas(void)
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
if (rc < 0)
- panic("Failed to initialized percpu areas.");
+ panic("Failed to initialize percpu areas.");
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu)
__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
-#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */
+
+#else /* CONFIG_SMP */
+
+/*
+ * UP percpu area setup.
+ *
+ * UP always uses km-based percpu allocator with identity mapping.
+ * Static percpu variables are indistinguishable from the usual static
+ * variables and don't require any special preparation.
+ */
+void __init setup_per_cpu_areas(void)
+{
+ const size_t unit_size =
+ roundup_pow_of_two(max_t(size_t, PCPU_MIN_UNIT_SIZE,
+ PERCPU_DYNAMIC_RESERVE));
+ struct pcpu_alloc_info *ai;
+ void *fc;
+
+ ai = pcpu_alloc_alloc_info(1, 1);
+ fc = __alloc_bootmem(unit_size, PAGE_SIZE, __pa(MAX_DMA_ADDRESS));
+ if (!ai || !fc)
+ panic("Failed to allocate memory for percpu areas.");
+
+ ai->dyn_size = unit_size;
+ ai->unit_size = unit_size;
+ ai->atom_size = unit_size;
+ ai->alloc_size = unit_size;
+ ai->groups[0].nr_units = 1;
+ ai->groups[0].cpu_map[0] = 0;
+
+ if (pcpu_setup_first_chunk(ai, fc) < 0)
+ panic("Failed to initialize percpu areas.");
+}
+
+#endif /* CONFIG_SMP */
/*
* First and reserved chunks are initialized with temporary allocation
diff --git a/mm/percpu_up.c b/mm/percpu_up.c
deleted file mode 100644
index c4351c7..0000000
--- a/mm/percpu_up.c
+++ /dev/null
@@ -1,30 +0,0 @@
-/*
- * mm/percpu_up.c - dummy percpu memory allocator implementation for UP
- */
-
-#include <linux/module.h>
-#include <linux/percpu.h>
-#include <linux/slab.h>
-
-void __percpu *__alloc_percpu(size_t size, size_t align)
-{
- /*
- * Can't easily make larger alignment work with kmalloc. WARN
- * on it. Larger alignment should only be used for module
- * percpu sections on SMP for which this path isn't used.
- */
- WARN_ON_ONCE(align > SMP_CACHE_BYTES);
- return kzalloc(size, GFP_KERNEL);
-}
-EXPORT_SYMBOL_GPL(__alloc_percpu);
-
-void free_percpu(void __percpu *p)
-{
- kfree(p);
-}
-EXPORT_SYMBOL_GPL(free_percpu);
-
-phys_addr_t per_cpu_ptr_to_phys(void *addr)
-{
- return __pa(addr);
-}
diff --git a/mm/rmap.c b/mm/rmap.c
index a7d0f54..92e6757 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -56,6 +56,7 @@
#include <linux/memcontrol.h>
#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
+#include <linux/hugetlb.h>
#include <asm/tlbflush.h>
@@ -315,7 +316,7 @@ void __init anon_vma_init(void)
*/
struct anon_vma *page_lock_anon_vma(struct page *page)
{
- struct anon_vma *anon_vma;
+ struct anon_vma *anon_vma, *root_anon_vma;
unsigned long anon_mapping;
rcu_read_lock();
@@ -326,8 +327,21 @@ struct anon_vma *page_lock_anon_vma(struct page *page)
goto out;
anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON);
- anon_vma_lock(anon_vma);
- return anon_vma;
+ root_anon_vma = ACCESS_ONCE(anon_vma->root);
+ spin_lock(&root_anon_vma->lock);
+
+ /*
+ * If this page is still mapped, then its anon_vma cannot have been
+ * freed. But if it has been unmapped, we have no security against
+ * the anon_vma structure being freed and reused (for another anon_vma:
+ * SLAB_DESTROY_BY_RCU guarantees that - so the spin_lock above cannot
+ * corrupt): with anon_vma_prepare() or anon_vma_fork() redirecting
+ * anon_vma->root before page_unlock_anon_vma() is called to unlock.
+ */
+ if (page_mapped(page))
+ return anon_vma;
+
+ spin_unlock(&root_anon_vma->lock);
out:
rcu_read_unlock();
return NULL;
@@ -350,6 +364,8 @@ vma_address(struct page *page, struct vm_area_struct *vma)
pgoff_t pgoff = page->index << (PAGE_CACHE_SHIFT - PAGE_SHIFT);
unsigned long address;
+ if (unlikely(is_vm_hugetlb_page(vma)))
+ pgoff = page->index << huge_page_order(page_hstate(page));
address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT);
if (unlikely(address < vma->vm_start || address >= vma->vm_end)) {
/* page should be within @vma mapping range */
@@ -365,7 +381,13 @@ vma_address(struct page *page, struct vm_area_struct *vma)
unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
{
if (PageAnon(page)) {
- if (vma->anon_vma->root != page_anon_vma(page)->root)
+ struct anon_vma *page__anon_vma = page_anon_vma(page);
+ /*
+ * Note: swapoff's unuse_vma() is more efficient with this
+ * check, and needs it to match anon_vma when KSM is active.
+ */
+ if (!vma->anon_vma || !page__anon_vma ||
+ vma->anon_vma->root != page__anon_vma->root)
return -EFAULT;
} else if (page->mapping && !(vma->vm_flags & VM_NONLINEAR)) {
if (!vma->vm_file ||
@@ -394,6 +416,12 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
pte_t *pte;
spinlock_t *ptl;
+ if (unlikely(PageHuge(page))) {
+ pte = huge_pte_offset(mm, address);
+ ptl = &mm->page_table_lock;
+ goto check;
+ }
+
pgd = pgd_offset(mm, address);
if (!pgd_present(*pgd))
return NULL;
@@ -414,6 +442,7 @@ pte_t *page_check_address(struct page *page, struct mm_struct *mm,
}
ptl = pte_lockptr(mm, pmd);
+check:
spin_lock(ptl);
if (pte_present(*pte) && page_to_pfn(page) == pte_pfn(*pte)) {
*ptlp = ptl;
@@ -916,6 +945,12 @@ void page_remove_rmap(struct page *page)
page_clear_dirty(page);
set_page_dirty(page);
}
+ /*
+ * Hugepages are not counted in NR_ANON_PAGES nor NR_FILE_MAPPED
+ * and not charged by memcg for now.
+ */
+ if (unlikely(PageHuge(page)))
+ return;
if (PageAnon(page)) {
mem_cgroup_uncharge_page(page);
__dec_zone_page_state(page, NR_ANON_PAGES);
@@ -1524,3 +1559,49 @@ int rmap_walk(struct page *page, int (*rmap_one)(struct page *,
return rmap_walk_file(page, rmap_one, arg);
}
#endif /* CONFIG_MIGRATION */
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * The following three functions are for anonymous (private mapped) hugepages.
+ * Unlike common anonymous pages, anonymous hugepages have no accounting code
+ * and no lru code, because we handle hugepages differently from common pages.
+ */
+static void __hugepage_set_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address, int exclusive)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+
+ BUG_ON(!anon_vma);
+
+ if (PageAnon(page))
+ return;
+ if (!exclusive)
+ anon_vma = anon_vma->root;
+
+ anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
+ page->mapping = (struct address_space *) anon_vma;
+ page->index = linear_page_index(vma, address);
+}
+
+void hugepage_add_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ struct anon_vma *anon_vma = vma->anon_vma;
+ int first;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!anon_vma);
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ first = atomic_inc_and_test(&page->_mapcount);
+ if (first)
+ __hugepage_set_anon_rmap(page, vma, address, 0);
+}
+
+void hugepage_add_new_anon_rmap(struct page *page,
+ struct vm_area_struct *vma, unsigned long address)
+{
+ BUG_ON(address < vma->vm_start || address >= vma->vm_end);
+ atomic_set(&page->_mapcount, 0);
+ __hugepage_set_anon_rmap(page, vma, address, 1);
+}
+#endif /* CONFIG_HUGETLB_PAGE */
diff --git a/mm/shmem.c b/mm/shmem.c
index dfaa0f4..080b09a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2325,7 +2325,10 @@ static int shmem_show_options(struct seq_file *seq, struct vfsmount *vfs)
static void shmem_put_super(struct super_block *sb)
{
- kfree(sb->s_fs_info);
+ struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+
+ percpu_counter_destroy(&sbinfo->used_blocks);
+ kfree(sbinfo);
sb->s_fs_info = NULL;
}
@@ -2367,7 +2370,8 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent)
#endif
spin_lock_init(&sbinfo->stat_lock);
- percpu_counter_init(&sbinfo->used_blocks, 0);
+ if (percpu_counter_init(&sbinfo->used_blocks, 0))
+ goto failed;
sbinfo->free_inodes = sbinfo->max_inodes;
sb->s_maxbytes = SHMEM_MAX_BYTES;
diff --git a/mm/slab.c b/mm/slab.c
index 88435fc..fcae981 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2330,8 +2330,8 @@ kmem_cache_create (const char *name, size_t size, size_t align,
}
#if FORCED_DEBUG && defined(CONFIG_DEBUG_PAGEALLOC)
if (size >= malloc_sizes[INDEX_L3 + 1].cs_size
- && cachep->obj_size > cache_line_size() && size < PAGE_SIZE) {
- cachep->obj_offset += PAGE_SIZE - size;
+ && cachep->obj_size > cache_line_size() && ALIGN(size, align) < PAGE_SIZE) {
+ cachep->obj_offset += PAGE_SIZE - ALIGN(size, align);
size = PAGE_SIZE;
}
#endif
diff --git a/mm/slob.c b/mm/slob.c
index d582171..617b6d6 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -500,7 +500,9 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
} else {
unsigned int order = get_order(size);
- ret = slob_new_pages(gfp | __GFP_COMP, get_order(size), node);
+ if (likely(order))
+ gfp |= __GFP_COMP;
+ ret = slob_new_pages(gfp, order, node);
if (ret) {
struct page *page;
page = virt_to_page(ret);
diff --git a/mm/slub.c b/mm/slub.c
index 13fffe1..8fd5401 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -168,7 +168,6 @@ static inline int kmem_cache_debug(struct kmem_cache *s)
/* Internal SLUB flags */
#define __OBJECT_POISON 0x80000000UL /* Poison object */
-#define __SYSFS_ADD_DEFERRED 0x40000000UL /* Not yet visible via sysfs */
static int kmem_size = sizeof(struct kmem_cache);
@@ -178,7 +177,7 @@ static struct notifier_block slab_notifier;
static enum {
DOWN, /* No slab functionality available */
- PARTIAL, /* kmem_cache_open() works but kmalloc does not */
+ PARTIAL, /* Kmem_cache_node works */
UP, /* Everything works but does not show up in sysfs */
SYSFS /* Sysfs up */
} slab_state = DOWN;
@@ -199,7 +198,7 @@ struct track {
enum track_item { TRACK_ALLOC, TRACK_FREE };
-#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
static void sysfs_slab_remove(struct kmem_cache *);
@@ -210,6 +209,7 @@ static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
static inline void sysfs_slab_remove(struct kmem_cache *s)
{
+ kfree(s->name);
kfree(s);
}
@@ -233,11 +233,7 @@ int slab_is_available(void)
static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
{
-#ifdef CONFIG_NUMA
return s->node[node];
-#else
- return &s->local_node;
-#endif
}
/* Verify that a pointer has an address that is valid within a slab page */
@@ -494,7 +490,7 @@ static void slab_err(struct kmem_cache *s, struct page *page, char *fmt, ...)
dump_stack();
}
-static void init_object(struct kmem_cache *s, void *object, int active)
+static void init_object(struct kmem_cache *s, void *object, u8 val)
{
u8 *p = object;
@@ -504,9 +500,7 @@ static void init_object(struct kmem_cache *s, void *object, int active)
}
if (s->flags & SLAB_RED_ZONE)
- memset(p + s->objsize,
- active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE,
- s->inuse - s->objsize);
+ memset(p + s->objsize, val, s->inuse - s->objsize);
}
static u8 *check_bytes(u8 *start, unsigned int value, unsigned int bytes)
@@ -641,17 +635,14 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
}
static int check_object(struct kmem_cache *s, struct page *page,
- void *object, int active)
+ void *object, u8 val)
{
u8 *p = object;
u8 *endobject = object + s->objsize;
if (s->flags & SLAB_RED_ZONE) {
- unsigned int red =
- active ? SLUB_RED_ACTIVE : SLUB_RED_INACTIVE;
-
if (!check_bytes_and_report(s, page, object, "Redzone",
- endobject, red, s->inuse - s->objsize))
+ endobject, val, s->inuse - s->objsize))
return 0;
} else {
if ((s->flags & SLAB_POISON) && s->objsize < s->inuse) {
@@ -661,7 +652,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
}
if (s->flags & SLAB_POISON) {
- if (!active && (s->flags & __OBJECT_POISON) &&
+ if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) &&
(!check_bytes_and_report(s, page, p, "Poison", p,
POISON_FREE, s->objsize - 1) ||
!check_bytes_and_report(s, page, p, "Poison",
@@ -673,7 +664,7 @@ static int check_object(struct kmem_cache *s, struct page *page,
check_pad_bytes(s, page, p);
}
- if (!s->offset && active)
+ if (!s->offset && val == SLUB_RED_ACTIVE)
/*
* Object and freepointer overlap. Cannot check
* freepointer while object is allocated.
@@ -792,6 +783,39 @@ static void trace(struct kmem_cache *s, struct page *page, void *object,
}
/*
+ * Hooks for other subsystems that check memory allocations. In a typical
+ * production configuration these hooks all should produce no code at all.
+ */
+static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+{
+ flags &= gfp_allowed_mask;
+ lockdep_trace_alloc(flags);
+ might_sleep_if(flags & __GFP_WAIT);
+
+ return should_failslab(s->objsize, flags, s->flags);
+}
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags, void *object)
+{
+ flags &= gfp_allowed_mask;
+ kmemcheck_slab_alloc(s, flags, object, s->objsize);
+ kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, flags);
+}
+
+static inline void slab_free_hook(struct kmem_cache *s, void *x)
+{
+ kmemleak_free_recursive(x, s->flags);
+}
+
+static inline void slab_free_hook_irq(struct kmem_cache *s, void *object)
+{
+ kmemcheck_slab_free(s, object, s->objsize);
+ debug_check_no_locks_freed(object, s->objsize);
+ if (!(s->flags & SLAB_DEBUG_OBJECTS))
+ debug_check_no_obj_freed(object, s->objsize);
+}
+
+/*
* Tracking of fully allocated slabs for debugging purposes.
*/
static void add_full(struct kmem_cache_node *n, struct page *page)
@@ -838,7 +862,7 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node, int objects)
* dilemma by deferring the increment of the count during
* bootstrap (see early_kmem_cache_node_alloc).
*/
- if (!NUMA_BUILD || n) {
+ if (n) {
atomic_long_inc(&n->nr_slabs);
atomic_long_add(objects, &n->total_objects);
}
@@ -858,11 +882,11 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
return;
- init_object(s, object, 0);
+ init_object(s, object, SLUB_RED_INACTIVE);
init_tracking(s, object);
}
-static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
+static noinline int alloc_debug_processing(struct kmem_cache *s, struct page *page,
void *object, unsigned long addr)
{
if (!check_slab(s, page))
@@ -878,14 +902,14 @@ static int alloc_debug_processing(struct kmem_cache *s, struct page *page,
goto bad;
}
- if (!check_object(s, page, object, 0))
+ if (!check_object(s, page, object, SLUB_RED_INACTIVE))
goto bad;
/* Success perform special debug activities for allocs */
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_ALLOC, addr);
trace(s, page, object, 1);
- init_object(s, object, 1);
+ init_object(s, object, SLUB_RED_ACTIVE);
return 1;
bad:
@@ -902,8 +926,8 @@ bad:
return 0;
}
-static int free_debug_processing(struct kmem_cache *s, struct page *page,
- void *object, unsigned long addr)
+static noinline int free_debug_processing(struct kmem_cache *s,
+ struct page *page, void *object, unsigned long addr)
{
if (!check_slab(s, page))
goto fail;
@@ -918,7 +942,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
goto fail;
}
- if (!check_object(s, page, object, 1))
+ if (!check_object(s, page, object, SLUB_RED_ACTIVE))
return 0;
if (unlikely(s != page->slab)) {
@@ -942,7 +966,7 @@ static int free_debug_processing(struct kmem_cache *s, struct page *page,
if (s->flags & SLAB_STORE_USER)
set_track(s, object, TRACK_FREE, addr);
trace(s, page, object, 0);
- init_object(s, object, 0);
+ init_object(s, object, SLUB_RED_INACTIVE);
return 1;
fail:
@@ -1046,7 +1070,7 @@ static inline int free_debug_processing(struct kmem_cache *s,
static inline int slab_pad_check(struct kmem_cache *s, struct page *page)
{ return 1; }
static inline int check_object(struct kmem_cache *s, struct page *page,
- void *object, int active) { return 1; }
+ void *object, u8 val) { return 1; }
static inline void add_full(struct kmem_cache_node *n, struct page *page) {}
static inline unsigned long kmem_cache_flags(unsigned long objsize,
unsigned long flags, const char *name,
@@ -1066,7 +1090,19 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
int objects) {}
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
-#endif
+
+static inline int slab_pre_alloc_hook(struct kmem_cache *s, gfp_t flags)
+ { return 0; }
+
+static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
+ void *object) {}
+
+static inline void slab_free_hook(struct kmem_cache *s, void *x) {}
+
+static inline void slab_free_hook_irq(struct kmem_cache *s,
+ void *object) {}
+
+#endif /* CONFIG_SLUB_DEBUG */
/*
* Slab allocation and freeing
@@ -1194,7 +1230,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
slab_pad_check(s, page);
for_each_object(p, s, page_address(page),
page->objects)
- check_object(s, page, p, 0);
+ check_object(s, page, p, SLUB_RED_INACTIVE);
}
kmemcheck_free_shadow(page, compound_order(page));
@@ -1274,13 +1310,19 @@ static void add_partial(struct kmem_cache_node *n,
spin_unlock(&n->list_lock);
}
+static inline void __remove_partial(struct kmem_cache_node *n,
+ struct page *page)
+{
+ list_del(&page->lru);
+ n->nr_partial--;
+}
+
static void remove_partial(struct kmem_cache *s, struct page *page)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
spin_lock(&n->list_lock);
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
spin_unlock(&n->list_lock);
}
@@ -1293,8 +1335,7 @@ static inline int lock_and_freeze_slab(struct kmem_cache_node *n,
struct page *page)
{
if (slab_trylock(page)) {
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
__SetPageSlubFrozen(page);
return 1;
}
@@ -1405,6 +1446,7 @@ static struct page *get_partial(struct kmem_cache *s, gfp_t flags, int node)
* On exit the slab lock will have been dropped.
*/
static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
+ __releases(bitlock)
{
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
@@ -1447,6 +1489,7 @@ static void unfreeze_slab(struct kmem_cache *s, struct page *page, int tail)
* Remove the cpu slab
*/
static void deactivate_slab(struct kmem_cache *s, struct kmem_cache_cpu *c)
+ __releases(bitlock)
{
struct page *page = c->page;
int tail = 1;
@@ -1647,6 +1690,7 @@ new_slab:
goto load_freelist;
}
+ gfpflags &= gfp_allowed_mask;
if (gfpflags & __GFP_WAIT)
local_irq_enable();
@@ -1674,7 +1718,7 @@ debug:
c->page->inuse++;
c->page->freelist = get_freepointer(s, object);
- c->node = -1;
+ c->node = NUMA_NO_NODE;
goto unlock_out;
}
@@ -1695,12 +1739,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long flags;
- gfpflags &= gfp_allowed_mask;
-
- lockdep_trace_alloc(gfpflags);
- might_sleep_if(gfpflags & __GFP_WAIT);
-
- if (should_failslab(s->objsize, gfpflags, s->flags))
+ if (slab_pre_alloc_hook(s, gfpflags))
return NULL;
local_irq_save(flags);
@@ -1719,8 +1758,7 @@ static __always_inline void *slab_alloc(struct kmem_cache *s,
if (unlikely(gfpflags & __GFP_ZERO) && object)
memset(object, 0, s->objsize);
- kmemcheck_slab_alloc(s, gfpflags, object, s->objsize);
- kmemleak_alloc_recursive(object, s->objsize, 1, s->flags, gfpflags);
+ slab_post_alloc_hook(s, gfpflags, object);
return object;
}
@@ -1754,7 +1792,6 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
return ret;
}
EXPORT_SYMBOL(kmem_cache_alloc_node);
-#endif
#ifdef CONFIG_TRACING
void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
@@ -1765,6 +1802,7 @@ void *kmem_cache_alloc_node_notrace(struct kmem_cache *s,
}
EXPORT_SYMBOL(kmem_cache_alloc_node_notrace);
#endif
+#endif
/*
* Slow patch handling. This may still be called frequently since objects
@@ -1850,14 +1888,14 @@ static __always_inline void slab_free(struct kmem_cache *s,
struct kmem_cache_cpu *c;
unsigned long flags;
- kmemleak_free_recursive(x, s->flags);
+ slab_free_hook(s, x);
+
local_irq_save(flags);
c = __this_cpu_ptr(s->cpu_slab);
- kmemcheck_slab_free(s, object, s->objsize);
- debug_check_no_locks_freed(object, s->objsize);
- if (!(s->flags & SLAB_DEBUG_OBJECTS))
- debug_check_no_obj_freed(object, s->objsize);
- if (likely(page == c->page && c->node >= 0)) {
+
+ slab_free_hook_irq(s, x);
+
+ if (likely(page == c->page && c->node != NUMA_NO_NODE)) {
set_freepointer(s, object, c->freelist);
c->freelist = object;
stat(s, FREE_FASTPATH);
@@ -2062,26 +2100,18 @@ init_kmem_cache_node(struct kmem_cache_node *n, struct kmem_cache *s)
#endif
}
-static DEFINE_PER_CPU(struct kmem_cache_cpu, kmalloc_percpu[KMALLOC_CACHES]);
-
-static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
+static inline int alloc_kmem_cache_cpus(struct kmem_cache *s)
{
- if (s < kmalloc_caches + KMALLOC_CACHES && s >= kmalloc_caches)
- /*
- * Boot time creation of the kmalloc array. Use static per cpu data
- * since the per cpu allocator is not available yet.
- */
- s->cpu_slab = kmalloc_percpu + (s - kmalloc_caches);
- else
- s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
+ BUILD_BUG_ON(PERCPU_DYNAMIC_EARLY_SIZE <
+ SLUB_PAGE_SHIFT * sizeof(struct kmem_cache_cpu));
- if (!s->cpu_slab)
- return 0;
+ s->cpu_slab = alloc_percpu(struct kmem_cache_cpu);
- return 1;
+ return s->cpu_slab != NULL;
}
-#ifdef CONFIG_NUMA
+static struct kmem_cache *kmem_cache_node;
+
/*
* No kmalloc_node yet so do it by hand. We know that this is the first
* slab on the node for this slabcache. There are no concurrent accesses
@@ -2091,15 +2121,15 @@ static inline int alloc_kmem_cache_cpus(struct kmem_cache *s, gfp_t flags)
* when allocating for the kmalloc_node_cache. This is used for bootstrapping
* memory on a fresh node that has no slab structures yet.
*/
-static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
+static void early_kmem_cache_node_alloc(int node)
{
struct page *page;
struct kmem_cache_node *n;
unsigned long flags;
- BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
+ BUG_ON(kmem_cache_node->size < sizeof(struct kmem_cache_node));
- page = new_slab(kmalloc_caches, gfpflags, node);
+ page = new_slab(kmem_cache_node, GFP_NOWAIT, node);
BUG_ON(!page);
if (page_to_nid(page) != node) {
@@ -2111,15 +2141,15 @@ static void early_kmem_cache_node_alloc(gfp_t gfpflags, int node)
n = page->freelist;
BUG_ON(!n);
- page->freelist = get_freepointer(kmalloc_caches, n);
+ page->freelist = get_freepointer(kmem_cache_node, n);
page->inuse++;
- kmalloc_caches->node[node] = n;
+ kmem_cache_node->node[node] = n;
#ifdef CONFIG_SLUB_DEBUG
- init_object(kmalloc_caches, n, 1);
- init_tracking(kmalloc_caches, n);
+ init_object(kmem_cache_node, n, SLUB_RED_ACTIVE);
+ init_tracking(kmem_cache_node, n);
#endif
- init_kmem_cache_node(n, kmalloc_caches);
- inc_slabs_node(kmalloc_caches, node, page->objects);
+ init_kmem_cache_node(n, kmem_cache_node);
+ inc_slabs_node(kmem_cache_node, node, page->objects);
/*
* lockdep requires consistent irq usage for each lock
@@ -2137,13 +2167,15 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = s->node[node];
+
if (n)
- kmem_cache_free(kmalloc_caches, n);
+ kmem_cache_free(kmem_cache_node, n);
+
s->node[node] = NULL;
}
}
-static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
+static int init_kmem_cache_nodes(struct kmem_cache *s)
{
int node;
@@ -2151,11 +2183,11 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
struct kmem_cache_node *n;
if (slab_state == DOWN) {
- early_kmem_cache_node_alloc(gfpflags, node);
+ early_kmem_cache_node_alloc(node);
continue;
}
- n = kmem_cache_alloc_node(kmalloc_caches,
- gfpflags, node);
+ n = kmem_cache_alloc_node(kmem_cache_node,
+ GFP_KERNEL, node);
if (!n) {
free_kmem_cache_nodes(s);
@@ -2167,17 +2199,6 @@ static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
}
return 1;
}
-#else
-static void free_kmem_cache_nodes(struct kmem_cache *s)
-{
-}
-
-static int init_kmem_cache_nodes(struct kmem_cache *s, gfp_t gfpflags)
-{
- init_kmem_cache_node(&s->local_node, s);
- return 1;
-}
-#endif
static void set_min_partial(struct kmem_cache *s, unsigned long min)
{
@@ -2312,7 +2333,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
}
-static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
+static int kmem_cache_open(struct kmem_cache *s,
const char *name, size_t size,
size_t align, unsigned long flags,
void (*ctor)(void *))
@@ -2348,10 +2369,10 @@ static int kmem_cache_open(struct kmem_cache *s, gfp_t gfpflags,
#ifdef CONFIG_NUMA
s->remote_node_defrag_ratio = 1000;
#endif
- if (!init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
+ if (!init_kmem_cache_nodes(s))
goto error;
- if (alloc_kmem_cache_cpus(s, gfpflags & ~SLUB_DMA))
+ if (alloc_kmem_cache_cpus(s))
return 1;
free_kmem_cache_nodes(s);
@@ -2414,9 +2435,8 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
void *p;
- long *map = kzalloc(BITS_TO_LONGS(page->objects) * sizeof(long),
- GFP_ATOMIC);
-
+ unsigned long *map = kzalloc(BITS_TO_LONGS(page->objects) *
+ sizeof(long), GFP_ATOMIC);
if (!map)
return;
slab_err(s, page, "%s", text);
@@ -2448,9 +2468,8 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
spin_lock_irqsave(&n->list_lock, flags);
list_for_each_entry_safe(page, h, &n->partial, lru) {
if (!page->inuse) {
- list_del(&page->lru);
+ __remove_partial(n, page);
discard_slab(s, page);
- n->nr_partial--;
} else {
list_slab_objects(s, page,
"Objects remaining on kmem_cache_close()");
@@ -2507,9 +2526,15 @@ EXPORT_SYMBOL(kmem_cache_destroy);
* Kmalloc subsystem
*******************************************************************/
-struct kmem_cache kmalloc_caches[KMALLOC_CACHES] __cacheline_aligned;
+struct kmem_cache *kmalloc_caches[SLUB_PAGE_SHIFT];
EXPORT_SYMBOL(kmalloc_caches);
+static struct kmem_cache *kmem_cache;
+
+#ifdef CONFIG_ZONE_DMA
+static struct kmem_cache *kmalloc_dma_caches[SLUB_PAGE_SHIFT];
+#endif
+
static int __init setup_slub_min_order(char *str)
{
get_option(&str, &slub_min_order);
@@ -2546,116 +2571,29 @@ static int __init setup_slub_nomerge(char *str)
__setup("slub_nomerge", setup_slub_nomerge);
-static struct kmem_cache *create_kmalloc_cache(struct kmem_cache *s,
- const char *name, int size, gfp_t gfp_flags)
+static struct kmem_cache *__init create_kmalloc_cache(const char *name,
+ int size, unsigned int flags)
{
- unsigned int flags = 0;
+ struct kmem_cache *s;
- if (gfp_flags & SLUB_DMA)
- flags = SLAB_CACHE_DMA;
+ s = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
/*
* This function is called with IRQs disabled during early-boot on
* single CPU so there's no need to take slub_lock here.
*/
- if (!kmem_cache_open(s, gfp_flags, name, size, ARCH_KMALLOC_MINALIGN,
+ if (!kmem_cache_open(s, name, size, ARCH_KMALLOC_MINALIGN,
flags, NULL))
goto panic;
list_add(&s->list, &slab_caches);
-
- if (sysfs_slab_add(s))
- goto panic;
return s;
panic:
panic("Creation of kmalloc slab %s size=%d failed.\n", name, size);
+ return NULL;
}
-#ifdef CONFIG_ZONE_DMA
-static struct kmem_cache *kmalloc_caches_dma[SLUB_PAGE_SHIFT];
-
-static void sysfs_add_func(struct work_struct *w)
-{
- struct kmem_cache *s;
-
- down_write(&slub_lock);
- list_for_each_entry(s, &slab_caches, list) {
- if (s->flags & __SYSFS_ADD_DEFERRED) {
- s->flags &= ~__SYSFS_ADD_DEFERRED;
- sysfs_slab_add(s);
- }
- }
- up_write(&slub_lock);
-}
-
-static DECLARE_WORK(sysfs_add_work, sysfs_add_func);
-
-static noinline struct kmem_cache *dma_kmalloc_cache(int index, gfp_t flags)
-{
- struct kmem_cache *s;
- char *text;
- size_t realsize;
- unsigned long slabflags;
- int i;
-
- s = kmalloc_caches_dma[index];
- if (s)
- return s;
-
- /* Dynamically create dma cache */
- if (flags & __GFP_WAIT)
- down_write(&slub_lock);
- else {
- if (!down_write_trylock(&slub_lock))
- goto out;
- }
-
- if (kmalloc_caches_dma[index])
- goto unlock_out;
-
- realsize = kmalloc_caches[index].objsize;
- text = kasprintf(flags & ~SLUB_DMA, "kmalloc_dma-%d",
- (unsigned int)realsize);
-
- s = NULL;
- for (i = 0; i < KMALLOC_CACHES; i++)
- if (!kmalloc_caches[i].size)
- break;
-
- BUG_ON(i >= KMALLOC_CACHES);
- s = kmalloc_caches + i;
-
- /*
- * Must defer sysfs creation to a workqueue because we don't know
- * what context we are called from. Before sysfs comes up, we don't
- * need to do anything because our sysfs initcall will start by
- * adding all existing slabs to sysfs.
- */
- slabflags = SLAB_CACHE_DMA|SLAB_NOTRACK;
- if (slab_state >= SYSFS)
- slabflags |= __SYSFS_ADD_DEFERRED;
-
- if (!text || !kmem_cache_open(s, flags, text,
- realsize, ARCH_KMALLOC_MINALIGN, slabflags, NULL)) {
- s->size = 0;
- kfree(text);
- goto unlock_out;
- }
-
- list_add(&s->list, &slab_caches);
- kmalloc_caches_dma[index] = s;
-
- if (slab_state >= SYSFS)
- schedule_work(&sysfs_add_work);
-
-unlock_out:
- up_write(&slub_lock);
-out:
- return kmalloc_caches_dma[index];
-}
-#endif
-
/*
* Conversion table for small slabs sizes / 8 to the index in the
* kmalloc array. This is necessary for slabs < 192 since we have non power
@@ -2708,10 +2646,10 @@ static struct kmem_cache *get_slab(size_t size, gfp_t flags)
#ifdef CONFIG_ZONE_DMA
if (unlikely((flags & SLUB_DMA)))
- return dma_kmalloc_cache(index, flags);
+ return kmalloc_dma_caches[index];
#endif
- return &kmalloc_caches[index];
+ return kmalloc_caches[index];
}
void *__kmalloc(size_t size, gfp_t flags)
@@ -2735,6 +2673,7 @@ void *__kmalloc(size_t size, gfp_t flags)
}
EXPORT_SYMBOL(__kmalloc);
+#ifdef CONFIG_NUMA
static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
{
struct page *page;
@@ -2749,7 +2688,6 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
return ptr;
}
-#ifdef CONFIG_NUMA
void *__kmalloc_node(size_t size, gfp_t flags, int node)
{
struct kmem_cache *s;
@@ -2889,8 +2827,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
* may have freed the last object and be
* waiting to release the slab.
*/
- list_del(&page->lru);
- n->nr_partial--;
+ __remove_partial(n, page);
slab_unlock(page);
discard_slab(s, page);
} else {
@@ -2914,7 +2851,7 @@ int kmem_cache_shrink(struct kmem_cache *s)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-#if defined(CONFIG_NUMA) && defined(CONFIG_MEMORY_HOTPLUG)
+#if defined(CONFIG_MEMORY_HOTPLUG)
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
@@ -2956,7 +2893,7 @@ static void slab_mem_offline_callback(void *arg)
BUG_ON(slabs_node(s, offline_node));
s->node[offline_node] = NULL;
- kmem_cache_free(kmalloc_caches, n);
+ kmem_cache_free(kmem_cache_node, n);
}
}
up_read(&slub_lock);
@@ -2989,7 +2926,7 @@ static int slab_mem_going_online_callback(void *arg)
* since memory is not yet available from the node that
* is brought up.
*/
- n = kmem_cache_alloc(kmalloc_caches, GFP_KERNEL);
+ n = kmem_cache_alloc(kmem_cache_node, GFP_KERNEL);
if (!n) {
ret = -ENOMEM;
goto out;
@@ -3035,46 +2972,92 @@ static int slab_memory_callback(struct notifier_block *self,
* Basic setup of slabs
*******************************************************************/
+/*
+ * Used for early kmem_cache structures that were allocated using
+ * the page allocator
+ */
+
+static void __init kmem_cache_bootstrap_fixup(struct kmem_cache *s)
+{
+ int node;
+
+ list_add(&s->list, &slab_caches);
+ s->refcount = -1;
+
+ for_each_node_state(node, N_NORMAL_MEMORY) {
+ struct kmem_cache_node *n = get_node(s, node);
+ struct page *p;
+
+ if (n) {
+ list_for_each_entry(p, &n->partial, lru)
+ p->slab = s;
+
+#ifdef CONFIG_SLAB_DEBUG
+ list_for_each_entry(p, &n->full, lru)
+ p->slab = s;
+#endif
+ }
+ }
+}
+
void __init kmem_cache_init(void)
{
int i;
int caches = 0;
+ struct kmem_cache *temp_kmem_cache;
+ int order;
+ struct kmem_cache *temp_kmem_cache_node;
+ unsigned long kmalloc_size;
+
+ kmem_size = offsetof(struct kmem_cache, node) +
+ nr_node_ids * sizeof(struct kmem_cache_node *);
+
+ /* Allocate two kmem_caches from the page allocator */
+ kmalloc_size = ALIGN(kmem_size, cache_line_size());
+ order = get_order(2 * kmalloc_size);
+ kmem_cache = (void *)__get_free_pages(GFP_NOWAIT, order);
-#ifdef CONFIG_NUMA
/*
* Must first have the slab cache available for the allocations of the
* struct kmem_cache_node's. There is special bootstrap code in
* kmem_cache_open for slab_state == DOWN.
*/
- create_kmalloc_cache(&kmalloc_caches[0], "kmem_cache_node",
- sizeof(struct kmem_cache_node), GFP_NOWAIT);
- kmalloc_caches[0].refcount = -1;
- caches++;
+ kmem_cache_node = (void *)kmem_cache + kmalloc_size;
+
+ kmem_cache_open(kmem_cache_node, "kmem_cache_node",
+ sizeof(struct kmem_cache_node),
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
hotplug_memory_notifier(slab_memory_callback, SLAB_CALLBACK_PRI);
-#endif
/* Able to allocate the per node structures */
slab_state = PARTIAL;
- /* Caches that are not of the two-to-the-power-of size */
- if (KMALLOC_MIN_SIZE <= 32) {
- create_kmalloc_cache(&kmalloc_caches[1],
- "kmalloc-96", 96, GFP_NOWAIT);
- caches++;
- }
- if (KMALLOC_MIN_SIZE <= 64) {
- create_kmalloc_cache(&kmalloc_caches[2],
- "kmalloc-192", 192, GFP_NOWAIT);
- caches++;
- }
+ temp_kmem_cache = kmem_cache;
+ kmem_cache_open(kmem_cache, "kmem_cache", kmem_size,
+ 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ kmem_cache = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
+ memcpy(kmem_cache, temp_kmem_cache, kmem_size);
- for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
- create_kmalloc_cache(&kmalloc_caches[i],
- "kmalloc", 1 << i, GFP_NOWAIT);
- caches++;
- }
+ /*
+ * Allocate kmem_cache_node properly from the kmem_cache slab.
+ * kmem_cache_node is separately allocated so no need to
+ * update any list pointers.
+ */
+ temp_kmem_cache_node = kmem_cache_node;
+
+ kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT);
+ memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size);
+
+ kmem_cache_bootstrap_fixup(kmem_cache_node);
+ caches++;
+ kmem_cache_bootstrap_fixup(kmem_cache);
+ caches++;
+ /* Free temporary boot structure */
+ free_pages((unsigned long)temp_kmem_cache, order);
+
+ /* Now we can use the kmem_cache to allocate kmalloc slabs */
/*
* Patch up the size_index table if we have strange large alignment
@@ -3114,26 +3097,60 @@ void __init kmem_cache_init(void)
size_index[size_index_elem(i)] = 8;
}
+ /* Caches that are not of the two-to-the-power-of size */
+ if (KMALLOC_MIN_SIZE <= 32) {
+ kmalloc_caches[1] = create_kmalloc_cache("kmalloc-96", 96, 0);
+ caches++;
+ }
+
+ if (KMALLOC_MIN_SIZE <= 64) {
+ kmalloc_caches[2] = create_kmalloc_cache("kmalloc-192", 192, 0);
+ caches++;
+ }
+
+ for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
+ kmalloc_caches[i] = create_kmalloc_cache("kmalloc", 1 << i, 0);
+ caches++;
+ }
+
slab_state = UP;
/* Provide the correct kmalloc names now that the caches are up */
+ if (KMALLOC_MIN_SIZE <= 32) {
+ kmalloc_caches[1]->name = kstrdup(kmalloc_caches[1]->name, GFP_NOWAIT);
+ BUG_ON(!kmalloc_caches[1]->name);
+ }
+
+ if (KMALLOC_MIN_SIZE <= 64) {
+ kmalloc_caches[2]->name = kstrdup(kmalloc_caches[2]->name, GFP_NOWAIT);
+ BUG_ON(!kmalloc_caches[2]->name);
+ }
+
for (i = KMALLOC_SHIFT_LOW; i < SLUB_PAGE_SHIFT; i++) {
char *s = kasprintf(GFP_NOWAIT, "kmalloc-%d", 1 << i);
BUG_ON(!s);
- kmalloc_caches[i].name = s;
+ kmalloc_caches[i]->name = s;
}
#ifdef CONFIG_SMP
register_cpu_notifier(&slab_notifier);
#endif
-#ifdef CONFIG_NUMA
- kmem_size = offsetof(struct kmem_cache, node) +
- nr_node_ids * sizeof(struct kmem_cache_node *);
-#else
- kmem_size = sizeof(struct kmem_cache);
-#endif
+#ifdef CONFIG_ZONE_DMA
+ for (i = 0; i < SLUB_PAGE_SHIFT; i++) {
+ struct kmem_cache *s = kmalloc_caches[i];
+
+ if (s && s->size) {
+ char *name = kasprintf(GFP_NOWAIT,
+ "dma-kmalloc-%d", s->objsize);
+
+ BUG_ON(!name);
+ kmalloc_dma_caches[i] = create_kmalloc_cache(name,
+ s->objsize, SLAB_CACHE_DMA);
+ }
+ }
+#endif
printk(KERN_INFO
"SLUB: Genslabs=%d, HWalign=%d, Order=%d-%d, MinObjects=%d,"
" CPUs=%d, Nodes=%d\n",
@@ -3211,6 +3228,7 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
size_t align, unsigned long flags, void (*ctor)(void *))
{
struct kmem_cache *s;
+ char *n;
if (WARN_ON(!name))
return NULL;
@@ -3234,19 +3252,25 @@ struct kmem_cache *kmem_cache_create(const char *name, size_t size,
return s;
}
+ n = kstrdup(name, GFP_KERNEL);
+ if (!n)
+ goto err;
+
s = kmalloc(kmem_size, GFP_KERNEL);
if (s) {
- if (kmem_cache_open(s, GFP_KERNEL, name,
+ if (kmem_cache_open(s, n,
size, align, flags, ctor)) {
list_add(&s->list, &slab_caches);
if (sysfs_slab_add(s)) {
list_del(&s->list);
+ kfree(n);
kfree(s);
goto err;
}
up_write(&slub_lock);
return s;
}
+ kfree(n);
kfree(s);
}
up_write(&slub_lock);
@@ -3318,6 +3342,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
return ret;
}
+#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
int node, unsigned long caller)
{
@@ -3346,8 +3371,9 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return ret;
}
+#endif
-#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SYSFS
static int count_inuse(struct page *page)
{
return page->inuse;
@@ -3357,7 +3383,9 @@ static int count_total(struct page *page)
{
return page->objects;
}
+#endif
+#ifdef CONFIG_SLUB_DEBUG
static int validate_slab(struct kmem_cache *s, struct page *page,
unsigned long *map)
{
@@ -3448,65 +3476,6 @@ static long validate_slab_cache(struct kmem_cache *s)
kfree(map);
return count;
}
-
-#ifdef SLUB_RESILIENCY_TEST
-static void resiliency_test(void)
-{
- u8 *p;
-
- printk(KERN_ERR "SLUB resiliency testing\n");
- printk(KERN_ERR "-----------------------\n");
- printk(KERN_ERR "A. Corruption after allocation\n");
-
- p = kzalloc(16, GFP_KERNEL);
- p[16] = 0x12;
- printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
- " 0x12->0x%p\n\n", p + 16);
-
- validate_slab_cache(kmalloc_caches + 4);
-
- /* Hmmm... The next two are dangerous */
- p = kzalloc(32, GFP_KERNEL);
- p[32 + sizeof(void *)] = 0x34;
- printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
- " 0x34 -> -0x%p\n", p);
- printk(KERN_ERR
- "If allocated object is overwritten then not detectable\n\n");
-
- validate_slab_cache(kmalloc_caches + 5);
- p = kzalloc(64, GFP_KERNEL);
- p += 64 + (get_cycles() & 0xff) * sizeof(void *);
- *p = 0x56;
- printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
- p);
- printk(KERN_ERR
- "If allocated object is overwritten then not detectable\n\n");
- validate_slab_cache(kmalloc_caches + 6);
-
- printk(KERN_ERR "\nB. Corruption after free\n");
- p = kzalloc(128, GFP_KERNEL);
- kfree(p);
- *p = 0x78;
- printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches + 7);
-
- p = kzalloc(256, GFP_KERNEL);
- kfree(p);
- p[50] = 0x9a;
- printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
- p);
- validate_slab_cache(kmalloc_caches + 8);
-
- p = kzalloc(512, GFP_KERNEL);
- kfree(p);
- p[512] = 0xab;
- printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
- validate_slab_cache(kmalloc_caches + 9);
-}
-#else
-static void resiliency_test(void) {};
-#endif
-
/*
* Generate lists of code addresses where slabcache objects are allocated
* and freed.
@@ -3635,7 +3604,7 @@ static int add_location(struct loc_track *t, struct kmem_cache *s,
static void process_slab(struct loc_track *t, struct kmem_cache *s,
struct page *page, enum track_item alloc,
- long *map)
+ unsigned long *map)
{
void *addr = page_address(page);
void *p;
@@ -3735,7 +3704,71 @@ static int list_locations(struct kmem_cache *s, char *buf,
len += sprintf(buf, "No data\n");
return len;
}
+#endif
+
+#ifdef SLUB_RESILIENCY_TEST
+static void resiliency_test(void)
+{
+ u8 *p;
+ BUILD_BUG_ON(KMALLOC_MIN_SIZE > 16 || SLUB_PAGE_SHIFT < 10);
+
+ printk(KERN_ERR "SLUB resiliency testing\n");
+ printk(KERN_ERR "-----------------------\n");
+ printk(KERN_ERR "A. Corruption after allocation\n");
+
+ p = kzalloc(16, GFP_KERNEL);
+ p[16] = 0x12;
+ printk(KERN_ERR "\n1. kmalloc-16: Clobber Redzone/next pointer"
+ " 0x12->0x%p\n\n", p + 16);
+
+ validate_slab_cache(kmalloc_caches[4]);
+
+ /* Hmmm... The next two are dangerous */
+ p = kzalloc(32, GFP_KERNEL);
+ p[32 + sizeof(void *)] = 0x34;
+ printk(KERN_ERR "\n2. kmalloc-32: Clobber next pointer/next slab"
+ " 0x34 -> -0x%p\n", p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
+
+ validate_slab_cache(kmalloc_caches[5]);
+ p = kzalloc(64, GFP_KERNEL);
+ p += 64 + (get_cycles() & 0xff) * sizeof(void *);
+ *p = 0x56;
+ printk(KERN_ERR "\n3. kmalloc-64: corrupting random byte 0x56->0x%p\n",
+ p);
+ printk(KERN_ERR
+ "If allocated object is overwritten then not detectable\n\n");
+ validate_slab_cache(kmalloc_caches[6]);
+
+ printk(KERN_ERR "\nB. Corruption after free\n");
+ p = kzalloc(128, GFP_KERNEL);
+ kfree(p);
+ *p = 0x78;
+ printk(KERN_ERR "1. kmalloc-128: Clobber first word 0x78->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[7]);
+
+ p = kzalloc(256, GFP_KERNEL);
+ kfree(p);
+ p[50] = 0x9a;
+ printk(KERN_ERR "\n2. kmalloc-256: Clobber 50th byte 0x9a->0x%p\n\n",
+ p);
+ validate_slab_cache(kmalloc_caches[8]);
+
+ p = kzalloc(512, GFP_KERNEL);
+ kfree(p);
+ p[512] = 0xab;
+ printk(KERN_ERR "\n3. kmalloc-512: Clobber redzone 0xab->0x%p\n\n", p);
+ validate_slab_cache(kmalloc_caches[9]);
+}
+#else
+#ifdef CONFIG_SYSFS
+static void resiliency_test(void) {};
+#endif
+#endif
+
+#ifdef CONFIG_SYSFS
enum slab_stat_type {
SL_ALL, /* All slabs */
SL_PARTIAL, /* Only partially allocated slabs */
@@ -3788,6 +3821,8 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
}
}
+ down_read(&slub_lock);
+#ifdef CONFIG_SLUB_DEBUG
if (flags & SO_ALL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3804,7 +3839,9 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
nodes[node] += x;
}
- } else if (flags & SO_PARTIAL) {
+ } else
+#endif
+ if (flags & SO_PARTIAL) {
for_each_node_state(node, N_NORMAL_MEMORY) {
struct kmem_cache_node *n = get_node(s, node);
@@ -3829,6 +3866,7 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
return x + sprintf(buf + x, "\n");
}
+#ifdef CONFIG_SLUB_DEBUG
static int any_slab_objects(struct kmem_cache *s)
{
int node;
@@ -3844,6 +3882,7 @@ static int any_slab_objects(struct kmem_cache *s)
}
return 0;
}
+#endif
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj);
@@ -3945,12 +3984,6 @@ static ssize_t aliases_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(aliases);
-static ssize_t slabs_show(struct kmem_cache *s, char *buf)
-{
- return show_slab_objects(s, buf, SO_ALL);
-}
-SLAB_ATTR_RO(slabs);
-
static ssize_t partial_show(struct kmem_cache *s, char *buf)
{
return show_slab_objects(s, buf, SO_PARTIAL);
@@ -3975,93 +4008,83 @@ static ssize_t objects_partial_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objects_partial);
-static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
-{
- return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
-}
-SLAB_ATTR_RO(total_objects);
-
-static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
+static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
-static ssize_t sanity_checks_store(struct kmem_cache *s,
+static ssize_t reclaim_account_store(struct kmem_cache *s,
const char *buf, size_t length)
{
- s->flags &= ~SLAB_DEBUG_FREE;
+ s->flags &= ~SLAB_RECLAIM_ACCOUNT;
if (buf[0] == '1')
- s->flags |= SLAB_DEBUG_FREE;
+ s->flags |= SLAB_RECLAIM_ACCOUNT;
return length;
}
-SLAB_ATTR(sanity_checks);
+SLAB_ATTR(reclaim_account);
-static ssize_t trace_show(struct kmem_cache *s, char *buf)
+static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
}
+SLAB_ATTR_RO(hwcache_align);
-static ssize_t trace_store(struct kmem_cache *s, const char *buf,
- size_t length)
+#ifdef CONFIG_ZONE_DMA
+static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
- s->flags &= ~SLAB_TRACE;
- if (buf[0] == '1')
- s->flags |= SLAB_TRACE;
- return length;
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
}
-SLAB_ATTR(trace);
+SLAB_ATTR_RO(cache_dma);
+#endif
-#ifdef CONFIG_FAILSLAB
-static ssize_t failslab_show(struct kmem_cache *s, char *buf)
+static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
}
+SLAB_ATTR_RO(destroy_by_rcu);
-static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
- size_t length)
+#ifdef CONFIG_SLUB_DEBUG
+static ssize_t slabs_show(struct kmem_cache *s, char *buf)
{
- s->flags &= ~SLAB_FAILSLAB;
- if (buf[0] == '1')
- s->flags |= SLAB_FAILSLAB;
- return length;
+ return show_slab_objects(s, buf, SO_ALL);
}
-SLAB_ATTR(failslab);
-#endif
+SLAB_ATTR_RO(slabs);
-static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
+static ssize_t total_objects_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
+ return show_slab_objects(s, buf, SO_ALL|SO_TOTAL);
}
+SLAB_ATTR_RO(total_objects);
-static ssize_t reclaim_account_store(struct kmem_cache *s,
- const char *buf, size_t length)
+static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
- s->flags &= ~SLAB_RECLAIM_ACCOUNT;
- if (buf[0] == '1')
- s->flags |= SLAB_RECLAIM_ACCOUNT;
- return length;
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_DEBUG_FREE));
}
-SLAB_ATTR(reclaim_account);
-static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
+static ssize_t sanity_checks_store(struct kmem_cache *s,
+ const char *buf, size_t length)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
+ s->flags &= ~SLAB_DEBUG_FREE;
+ if (buf[0] == '1')
+ s->flags |= SLAB_DEBUG_FREE;
+ return length;
}
-SLAB_ATTR_RO(hwcache_align);
+SLAB_ATTR(sanity_checks);
-#ifdef CONFIG_ZONE_DMA
-static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
+static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
-SLAB_ATTR_RO(cache_dma);
-#endif
-static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
+static ssize_t trace_store(struct kmem_cache *s, const char *buf,
+ size_t length)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_DESTROY_BY_RCU));
+ s->flags &= ~SLAB_TRACE;
+ if (buf[0] == '1')
+ s->flags |= SLAB_TRACE;
+ return length;
}
-SLAB_ATTR_RO(destroy_by_rcu);
+SLAB_ATTR(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
@@ -4139,6 +4162,40 @@ static ssize_t validate_store(struct kmem_cache *s,
}
SLAB_ATTR(validate);
+static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_ALLOC);
+}
+SLAB_ATTR_RO(alloc_calls);
+
+static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
+{
+ if (!(s->flags & SLAB_STORE_USER))
+ return -ENOSYS;
+ return list_locations(s, buf, TRACK_FREE);
+}
+SLAB_ATTR_RO(free_calls);
+#endif /* CONFIG_SLUB_DEBUG */
+
+#ifdef CONFIG_FAILSLAB
+static ssize_t failslab_show(struct kmem_cache *s, char *buf)
+{
+ return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+}
+
+static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
+ size_t length)
+{
+ s->flags &= ~SLAB_FAILSLAB;
+ if (buf[0] == '1')
+ s->flags |= SLAB_FAILSLAB;
+ return length;
+}
+SLAB_ATTR(failslab);
+#endif
+
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
{
return 0;
@@ -4158,22 +4215,6 @@ static ssize_t shrink_store(struct kmem_cache *s,
}
SLAB_ATTR(shrink);
-static ssize_t alloc_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_ALLOC);
-}
-SLAB_ATTR_RO(alloc_calls);
-
-static ssize_t free_calls_show(struct kmem_cache *s, char *buf)
-{
- if (!(s->flags & SLAB_STORE_USER))
- return -ENOSYS;
- return list_locations(s, buf, TRACK_FREE);
-}
-SLAB_ATTR_RO(free_calls);
-
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
@@ -4279,25 +4320,27 @@ static struct attribute *slab_attrs[] = {
&min_partial_attr.attr,
&objects_attr.attr,
&objects_partial_attr.attr,
- &total_objects_attr.attr,
- &slabs_attr.attr,
&partial_attr.attr,
&cpu_slabs_attr.attr,
&ctor_attr.attr,
&aliases_attr.attr,
&align_attr.attr,
- &sanity_checks_attr.attr,
- &trace_attr.attr,
&hwcache_align_attr.attr,
&reclaim_account_attr.attr,
&destroy_by_rcu_attr.attr,
+ &shrink_attr.attr,
+#ifdef CONFIG_SLUB_DEBUG
+ &total_objects_attr.attr,
+ &slabs_attr.attr,
+ &sanity_checks_attr.attr,
+ &trace_attr.attr,
&red_zone_attr.attr,
&poison_attr.attr,
&store_user_attr.attr,
&validate_attr.attr,
- &shrink_attr.attr,
&alloc_calls_attr.attr,
&free_calls_attr.attr,
+#endif
#ifdef CONFIG_ZONE_DMA
&cache_dma_attr.attr,
#endif
@@ -4377,6 +4420,7 @@ static void kmem_cache_release(struct kobject *kobj)
{
struct kmem_cache *s = to_slab(kobj);
+ kfree(s->name);
kfree(s);
}
@@ -4579,7 +4623,7 @@ static int __init slab_sysfs_init(void)
}
__initcall(slab_sysfs_init);
-#endif
+#endif /* CONFIG_SYSFS */
/*
* The /proc/slabinfo ABI
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index aa33fd6..29d6cbf 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -220,18 +220,7 @@ void __init sparse_mem_maps_populate_node(struct page **map_map,
if (vmemmap_buf_start) {
/* need to free left buf */
-#ifdef CONFIG_NO_BOOTMEM
- free_early(__pa(vmemmap_buf_start), __pa(vmemmap_buf_end));
- if (vmemmap_buf_start < vmemmap_buf) {
- char name[15];
-
- snprintf(name, sizeof(name), "MEMMAP %d", nodeid);
- reserve_early_without_check(__pa(vmemmap_buf_start),
- __pa(vmemmap_buf), name);
- }
-#else
free_bootmem(__pa(vmemmap_buf), vmemmap_buf_end - vmemmap_buf);
-#endif
vmemmap_buf = NULL;
vmemmap_buf_end = NULL;
}
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 1f3f9c5..9fc7bac 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -47,8 +47,6 @@ long nr_swap_pages;
long total_swap_pages;
static int least_priority;
-static bool swap_for_hibernation;
-
static const char Bad_file[] = "Bad swap file entry ";
static const char Unused_file[] = "Unused swap file entry ";
static const char Bad_offset[] = "Bad swap offset entry ";
@@ -141,8 +139,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = ((sector_t)se->nr_pages - 1) << (PAGE_SHIFT - 9);
if (nr_blocks) {
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL,
- BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ nr_blocks, GFP_KERNEL, 0);
if (err)
return err;
cond_resched();
@@ -153,8 +150,7 @@ static int discard_swap(struct swap_info_struct *si)
nr_blocks = (sector_t)se->nr_pages << (PAGE_SHIFT - 9);
err = blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_KERNEL,
- BLKDEV_IFL_WAIT | BLKDEV_IFL_BARRIER);
+ nr_blocks, GFP_KERNEL, 0);
if (err)
break;
@@ -193,8 +189,7 @@ static void discard_swap_cluster(struct swap_info_struct *si,
start_block <<= PAGE_SHIFT - 9;
nr_blocks <<= PAGE_SHIFT - 9;
if (blkdev_issue_discard(si->bdev, start_block,
- nr_blocks, GFP_NOIO, BLKDEV_IFL_WAIT |
- BLKDEV_IFL_BARRIER))
+ nr_blocks, GFP_NOIO, 0))
break;
}
@@ -320,10 +315,8 @@ checks:
if (offset > si->highest_bit)
scan_base = offset = si->lowest_bit;
- /* reuse swap entry of cache-only swap if not hibernation. */
- if (vm_swap_full()
- && usage == SWAP_HAS_CACHE
- && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ /* reuse swap entry of cache-only swap if not busy. */
+ if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
int swap_was_freed;
spin_unlock(&swap_lock);
swap_was_freed = __try_to_reclaim_swap(si, offset);
@@ -453,8 +446,6 @@ swp_entry_t get_swap_page(void)
spin_lock(&swap_lock);
if (nr_swap_pages <= 0)
goto noswap;
- if (swap_for_hibernation)
- goto noswap;
nr_swap_pages--;
for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
@@ -487,6 +478,28 @@ noswap:
return (swp_entry_t) {0};
}
+/* The only caller of this function is now susupend routine */
+swp_entry_t get_swap_page_of_type(int type)
+{
+ struct swap_info_struct *si;
+ pgoff_t offset;
+
+ spin_lock(&swap_lock);
+ si = swap_info[type];
+ if (si && (si->flags & SWP_WRITEOK)) {
+ nr_swap_pages--;
+ /* This is called for allocating swap entry, not cache */
+ offset = scan_swap_map(si, 1);
+ if (offset) {
+ spin_unlock(&swap_lock);
+ return swp_entry(type, offset);
+ }
+ nr_swap_pages++;
+ }
+ spin_unlock(&swap_lock);
+ return (swp_entry_t) {0};
+}
+
static struct swap_info_struct *swap_info_get(swp_entry_t entry)
{
struct swap_info_struct *p;
@@ -670,6 +683,24 @@ int try_to_free_swap(struct page *page)
if (page_swapcount(page))
return 0;
+ /*
+ * Once hibernation has begun to create its image of memory,
+ * there's a danger that one of the calls to try_to_free_swap()
+ * - most probably a call from __try_to_reclaim_swap() while
+ * hibernation is allocating its own swap pages for the image,
+ * but conceivably even a call from memory reclaim - will free
+ * the swap from a page which has already been recorded in the
+ * image as a clean swapcache page, and then reuse its swap for
+ * another page of the image. On waking from hibernation, the
+ * original page might be freed under memory pressure, then
+ * later read back in from swap, now with the wrong data.
+ *
+ * Hibernation clears bits from gfp_allowed_mask to prevent
+ * memory reclaim from writing to disk, so check that here.
+ */
+ if (!(gfp_allowed_mask & __GFP_IO))
+ return 0;
+
delete_from_swap_cache(page);
SetPageDirty(page);
return 1;
@@ -746,74 +777,6 @@ int mem_cgroup_count_swap_user(swp_entry_t ent, struct page **pagep)
#endif
#ifdef CONFIG_HIBERNATION
-
-static pgoff_t hibernation_offset[MAX_SWAPFILES];
-/*
- * Once hibernation starts to use swap, we freeze swap_map[]. Otherwise,
- * saved swap_map[] image to the disk will be an incomplete because it's
- * changing without synchronization with hibernation snap shot.
- * At resume, we just make swap_for_hibernation=false. We can forget
- * used maps easily.
- */
-void hibernation_freeze_swap(void)
-{
- int i;
-
- spin_lock(&swap_lock);
-
- printk(KERN_INFO "PM: Freeze Swap\n");
- swap_for_hibernation = true;
- for (i = 0; i < MAX_SWAPFILES; i++)
- hibernation_offset[i] = 1;
- spin_unlock(&swap_lock);
-}
-
-void hibernation_thaw_swap(void)
-{
- spin_lock(&swap_lock);
- if (swap_for_hibernation) {
- printk(KERN_INFO "PM: Thaw Swap\n");
- swap_for_hibernation = false;
- }
- spin_unlock(&swap_lock);
-}
-
-/*
- * Because updateing swap_map[] can make not-saved-status-change,
- * we use our own easy allocator.
- * Please see kernel/power/swap.c, Used swaps are recorded into
- * RB-tree.
- */
-swp_entry_t get_swap_for_hibernation(int type)
-{
- pgoff_t off;
- swp_entry_t val = {0};
- struct swap_info_struct *si;
-
- spin_lock(&swap_lock);
-
- si = swap_info[type];
- if (!si || !(si->flags & SWP_WRITEOK))
- goto done;
-
- for (off = hibernation_offset[type]; off < si->max; ++off) {
- if (!si->swap_map[off])
- break;
- }
- if (off < si->max) {
- val = swp_entry(type, off);
- hibernation_offset[type] = off + 1;
- }
-done:
- spin_unlock(&swap_lock);
- return val;
-}
-
-void swap_free_for_hibernation(swp_entry_t ent)
-{
- /* Nothing to do */
-}
-
/*
* Find the swap type that corresponds to given device (if any).
*
@@ -2084,7 +2047,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
p->flags |= SWP_SOLIDSTATE;
p->cluster_next = 1 + (random32() % p->highest_bit);
}
- if (discard_swap(p) == 0)
+ if (discard_swap(p) == 0 && (swap_flags & SWAP_FLAG_DISCARD))
p->flags |= SWP_DISCARDABLE;
}
diff --git a/mm/util.c b/mm/util.c
index 4735ea4..73dac81 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -245,6 +245,19 @@ void arch_pick_mmap_layout(struct mm_struct *mm)
}
#endif
+/*
+ * Like get_user_pages_fast() except its IRQ-safe in that it won't fall
+ * back to the regular GUP.
+ * If the architecture not support this fucntion, simply return with no
+ * page pinned
+ */
+int __attribute__((weak)) __get_user_pages_fast(unsigned long start,
+ int nr_pages, int write, struct page **pages)
+{
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__get_user_pages_fast);
+
/**
* get_user_pages_fast() - pin user pages in memory
* @start: starting user address
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 918c513..9f90962 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -31,6 +31,7 @@
#include <asm/tlbflush.h>
#include <asm/shmparam.h>
+bool vmap_lazy_unmap __read_mostly = true;
/*** Page table manipulation functions ***/
@@ -502,6 +503,9 @@ static unsigned long lazy_max_pages(void)
{
unsigned int log;
+ if (!vmap_lazy_unmap)
+ return 0;
+
log = fls(num_online_cpus());
return log * (32UL * 1024 * 1024 / PAGE_SIZE);
@@ -513,6 +517,15 @@ static atomic_t vmap_lazy_nr = ATOMIC_INIT(0);
static void purge_fragmented_blocks_allcpus(void);
/*
+ * called before a call to iounmap() if the caller wants vm_area_struct's
+ * immediately freed.
+ */
+void set_iounmap_nonlazy(void)
+{
+ atomic_set(&vmap_lazy_nr, lazy_max_pages()+1);
+}
+
+/*
* Purges all lazily-freed vmap areas.
*
* If sync is 0 then don't purge if there is already a purge in progress.
@@ -2052,6 +2065,7 @@ void free_vm_area(struct vm_struct *area)
}
EXPORT_SYMBOL_GPL(free_vm_area);
+#ifdef CONFIG_SMP
static struct vmap_area *node_to_va(struct rb_node *n)
{
return n ? rb_entry(n, struct vmap_area, rb_node) : NULL;
@@ -2332,6 +2346,7 @@ void pcpu_free_vm_areas(struct vm_struct **vms, int nr_vms)
free_vm_area(vms[i]);
kfree(vms);
}
+#endif /* CONFIG_SMP */
#ifdef CONFIG_PROC_FS
static void *s_start(struct seq_file *m, loff_t *pos)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 18fa3d7..b94c946 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1804,12 +1804,11 @@ static void shrink_zone(int priority, struct zone *zone,
* If a zone is deemed to be full of pinned pages then just give it a light
* scan then give up on it.
*/
-static bool shrink_zones(int priority, struct zonelist *zonelist,
+static void shrink_zones(int priority, struct zonelist *zonelist,
struct scan_control *sc)
{
struct zoneref *z;
struct zone *zone;
- bool all_unreclaimable = true;
for_each_zone_zonelist_nodemask(zone, z, zonelist,
gfp_zone(sc->gfp_mask), sc->nodemask) {
@@ -1827,8 +1826,38 @@ static bool shrink_zones(int priority, struct zonelist *zonelist,
}
shrink_zone(priority, zone, sc);
- all_unreclaimable = false;
}
+}
+
+static bool zone_reclaimable(struct zone *zone)
+{
+ return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
+}
+
+/*
+ * As hibernation is going on, kswapd is freezed so that it can't mark
+ * the zone into all_unreclaimable. It can't handle OOM during hibernation.
+ * So let's check zone's unreclaimable in direct reclaim as well as kswapd.
+ */
+static bool all_unreclaimable(struct zonelist *zonelist,
+ struct scan_control *sc)
+{
+ struct zoneref *z;
+ struct zone *zone;
+ bool all_unreclaimable = true;
+
+ for_each_zone_zonelist_nodemask(zone, z, zonelist,
+ gfp_zone(sc->gfp_mask), sc->nodemask) {
+ if (!populated_zone(zone))
+ continue;
+ if (!cpuset_zone_allowed_hardwall(zone, GFP_KERNEL))
+ continue;
+ if (zone_reclaimable(zone)) {
+ all_unreclaimable = false;
+ break;
+ }
+ }
+
return all_unreclaimable;
}
@@ -1852,7 +1881,6 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
struct scan_control *sc)
{
int priority;
- bool all_unreclaimable;
unsigned long total_scanned = 0;
struct reclaim_state *reclaim_state = current->reclaim_state;
struct zoneref *z;
@@ -1869,7 +1897,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
sc->nr_scanned = 0;
if (!priority)
disable_swap_token();
- all_unreclaimable = shrink_zones(priority, zonelist, sc);
+ shrink_zones(priority, zonelist, sc);
/*
* Don't shrink slabs when reclaiming memory from
* over limit cgroups
@@ -1931,7 +1959,7 @@ out:
return sc->nr_reclaimed;
/* top priority shrink_zones still had more to do? don't OOM, then */
- if (scanning_global_lru(sc) && !all_unreclaimable)
+ if (scanning_global_lru(sc) && !all_unreclaimable(zonelist, sc))
return 1;
return 0;
@@ -1969,9 +1997,10 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap,
unsigned int swappiness,
- struct zone *zone, int nid)
+ struct zone *zone)
{
struct scan_control sc = {
+ .nr_to_reclaim = SWAP_CLUSTER_MAX,
.may_writepage = !laptop_mode,
.may_unmap = 1,
.may_swap = !noswap,
@@ -1979,13 +2008,8 @@ unsigned long mem_cgroup_shrink_node_zone(struct mem_cgroup *mem,
.order = 0,
.mem_cgroup = mem,
};
- nodemask_t nm = nodemask_of_node(nid);
-
sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) |
(GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK);
- sc.nodemask = &nm;
- sc.nr_reclaimed = 0;
- sc.nr_scanned = 0;
trace_mm_vmscan_memcg_softlimit_reclaim_begin(0,
sc.may_writepage,
@@ -2172,7 +2196,6 @@ loop_again:
for (i = 0; i <= end_zone; i++) {
struct zone *zone = pgdat->node_zones + i;
int nr_slab;
- int nid, zid;
if (!populated_zone(zone))
continue;
@@ -2182,14 +2205,12 @@ loop_again:
sc.nr_scanned = 0;
- nid = pgdat->node_id;
- zid = zone_idx(zone);
/*
* Call soft limit reclaim before calling shrink_zone.
* For now we ignore the return value
*/
- mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask,
- nid, zid);
+ mem_cgroup_soft_limit_reclaim(zone, order, sc.gfp_mask);
+
/*
* We put equal pressure on every zone, unless one
* zone has way too many pages free already.
@@ -2204,8 +2225,7 @@ loop_again:
total_scanned += sc.nr_scanned;
if (zone->all_unreclaimable)
continue;
- if (nr_slab == 0 &&
- zone->pages_scanned >= (zone_reclaimable_pages(zone) * 6))
+ if (nr_slab == 0 && !zone_reclaimable(zone))
zone->all_unreclaimable = 1;
/*
* If we've done a decent amount of scanning and
diff --git a/mm/vmstat.c b/mm/vmstat.c
index f389168..355a9e6 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -138,11 +138,24 @@ static void refresh_zone_stat_thresholds(void)
int threshold;
for_each_populated_zone(zone) {
+ unsigned long max_drift, tolerate_drift;
+
threshold = calculate_threshold(zone);
for_each_online_cpu(cpu)
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
= threshold;
+
+ /*
+ * Only set percpu_drift_mark if there is a danger that
+ * NR_FREE_PAGES reports the low watermark is ok when in fact
+ * the min watermark could be breached by an allocation
+ */
+ tolerate_drift = low_wmark_pages(zone) - min_wmark_pages(zone);
+ max_drift = num_online_cpus() * threshold;
+ if (max_drift > tolerate_drift)
+ zone->percpu_drift_mark = high_wmark_pages(zone) +
+ max_drift;
}
}
@@ -813,7 +826,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
"\n scanned %lu"
"\n spanned %lu"
"\n present %lu",
- zone_page_state(zone, NR_FREE_PAGES),
+ zone_nr_free_pages(zone),
min_wmark_pages(zone),
low_wmark_pages(zone),
high_wmark_pages(zone),
@@ -998,6 +1011,7 @@ static int __cpuinit vmstat_cpuup_callback(struct notifier_block *nfb,
switch (action) {
case CPU_ONLINE:
case CPU_ONLINE_FROZEN:
+ refresh_zone_stat_thresholds();
start_cpu_timer(cpu);
node_set_state(cpu_to_node(cpu), N_CPU);
break;
OpenPOWER on IntegriCloud