summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/filemap.c6
-rw-r--r--mm/internal.h5
-rw-r--r--mm/memblock.c2
-rw-r--r--mm/memcontrol.c266
-rw-r--r--mm/memory.c2
-rw-r--r--mm/memory_hotplug.c10
-rw-r--r--mm/mmap.c10
-rw-r--r--mm/mremap.c18
-rw-r--r--mm/page_alloc.c90
-rw-r--r--mm/rmap.c2
-rw-r--r--mm/sparse.c2
-rw-r--r--mm/vmalloc.c63
-rw-r--r--mm/vmscan.c24
13 files changed, 232 insertions, 268 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 7905fe7..4b51ac1 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -1539,12 +1539,12 @@ static void do_sync_mmap_readahead(struct vm_area_struct *vma,
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
- if (VM_RandomReadHint(vma))
+ if (vma->vm_flags & VM_RAND_READ)
return;
if (!ra->ra_pages)
return;
- if (VM_SequentialReadHint(vma)) {
+ if (vma->vm_flags & VM_SEQ_READ) {
page_cache_sync_readahead(mapping, ra, file, offset,
ra->ra_pages);
return;
@@ -1584,7 +1584,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
struct address_space *mapping = file->f_mapping;
/* If we don't want any read-ahead, don't bother */
- if (VM_RandomReadHint(vma))
+ if (vma->vm_flags & VM_RAND_READ)
return;
if (ra->mmap_miss > 0)
ra->mmap_miss--;
diff --git a/mm/internal.h b/mm/internal.h
index 8562de0..4390ac6 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -32,11 +32,6 @@ static inline void set_page_refcounted(struct page *page)
set_page_count(page, 1);
}
-static inline void __put_page(struct page *page)
-{
- atomic_dec(&page->_count);
-}
-
static inline void __get_page_tail_foll(struct page *page,
bool get_page_head)
{
diff --git a/mm/memblock.c b/mm/memblock.c
index c5fad93..a847bfe6 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -566,7 +566,7 @@ int __init_memblock memblock_reserve(phys_addr_t base, phys_addr_t size)
/**
* __next_free_mem_range - next function for for_each_free_mem_range()
* @idx: pointer to u64 loop variable
- * @nid: nid: node selector, %MAX_NUMNODES for all nodes
+ * @nid: node selector, %MAX_NUMNODES for all nodes
* @out_start: ptr to phys_addr_t for start address of the range, can be %NULL
* @out_end: ptr to phys_addr_t for end address of the range, can be %NULL
* @out_nid: ptr to int for nid of the range, can be %NULL
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 2e851f4..d12ca6f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -187,10 +187,6 @@ struct mem_cgroup_per_node {
struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};
-struct mem_cgroup_lru_info {
- struct mem_cgroup_per_node *nodeinfo[0];
-};
-
/*
* Cgroups above their limits are maintained in a RB-Tree, independent of
* their hierarchy representation
@@ -267,28 +263,10 @@ struct mem_cgroup {
/* vmpressure notifications */
struct vmpressure vmpressure;
- union {
- /*
- * the counter to account for mem+swap usage.
- */
- struct res_counter memsw;
-
- /*
- * rcu_freeing is used only when freeing struct mem_cgroup,
- * so put it into a union to avoid wasting more memory.
- * It must be disjoint from the css field. It could be
- * in a union with the res field, but res plays a much
- * larger part in mem_cgroup life than memsw, and might
- * be of interest, even at time of free, when debugging.
- * So share rcu_head with the less interesting memsw.
- */
- struct rcu_head rcu_freeing;
- /*
- * We also need some space for a worker in deferred freeing.
- * By the time we call it, rcu_freeing is no longer in use.
- */
- struct work_struct work_freeing;
- };
+ /*
+ * the counter to account for mem+swap usage.
+ */
+ struct res_counter memsw;
/*
* the counter to account for kernel memory usage.
@@ -303,8 +281,6 @@ struct mem_cgroup {
bool oom_lock;
atomic_t under_oom;
- atomic_t refcnt;
-
int swappiness;
/* OOM-Killer disable */
int oom_kill_disable;
@@ -366,14 +342,8 @@ struct mem_cgroup {
atomic_t numainfo_updating;
#endif
- /*
- * Per cgroup active and inactive list, similar to the
- * per zone LRU lists.
- *
- * WARNING: This has to be the last element of the struct. Don't
- * add new fields after this point.
- */
- struct mem_cgroup_lru_info info;
+ struct mem_cgroup_per_node *nodeinfo[0];
+ /* WARNING: nodeinfo must be the last member here */
};
static size_t memcg_size(void)
@@ -416,6 +386,11 @@ static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
+ /*
+ * Our caller must use css_get() first, because memcg_uncharge_kmem()
+ * will call css_put() if it sees the memcg is dead.
+ */
+ smp_wmb();
if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}
@@ -508,9 +483,6 @@ enum res_type {
*/
static DEFINE_MUTEX(memcg_create_mutex);
-static void mem_cgroup_get(struct mem_cgroup *memcg);
-static void mem_cgroup_put(struct mem_cgroup *memcg);
-
static inline
struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s)
{
@@ -561,15 +533,15 @@ void sock_update_memcg(struct sock *sk)
*/
if (sk->sk_cgrp) {
BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg));
- mem_cgroup_get(sk->sk_cgrp->memcg);
+ css_get(&sk->sk_cgrp->memcg->css);
return;
}
rcu_read_lock();
memcg = mem_cgroup_from_task(current);
cg_proto = sk->sk_prot->proto_cgroup(memcg);
- if (!mem_cgroup_is_root(memcg) && memcg_proto_active(cg_proto)) {
- mem_cgroup_get(memcg);
+ if (!mem_cgroup_is_root(memcg) &&
+ memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) {
sk->sk_cgrp = cg_proto;
}
rcu_read_unlock();
@@ -583,7 +555,7 @@ void sock_release_memcg(struct sock *sk)
struct mem_cgroup *memcg;
WARN_ON(!sk->sk_cgrp->memcg);
memcg = sk->sk_cgrp->memcg;
- mem_cgroup_put(memcg);
+ css_put(&sk->sk_cgrp->memcg->css);
}
}
@@ -683,7 +655,7 @@ static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
VM_BUG_ON((unsigned)nid >= nr_node_ids);
- return &memcg->info.nodeinfo[nid]->zoneinfo[zid];
+ return &memcg->nodeinfo[nid]->zoneinfo[zid];
}
struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
@@ -3060,8 +3032,16 @@ static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size)
if (res_counter_uncharge(&memcg->kmem, size))
return;
+ /*
+ * Releases a reference taken in kmem_cgroup_css_offline in case
+ * this last uncharge is racing with the offlining code or it is
+ * outliving the memcg existence.
+ *
+ * The memory barrier imposed by test&clear is paired with the
+ * explicit one in memcg_kmem_mark_dead().
+ */
if (memcg_kmem_test_and_clear_dead(memcg))
- mem_cgroup_put(memcg);
+ css_put(&memcg->css);
}
void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep)
@@ -3252,7 +3232,7 @@ void memcg_release_cache(struct kmem_cache *s)
list_del(&s->memcg_params->list);
mutex_unlock(&memcg->slab_caches_mutex);
- mem_cgroup_put(memcg);
+ css_put(&memcg->css);
out:
kfree(s->memcg_params);
}
@@ -3412,16 +3392,18 @@ static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg,
mutex_lock(&memcg_cache_mutex);
new_cachep = cachep->memcg_params->memcg_caches[idx];
- if (new_cachep)
+ if (new_cachep) {
+ css_put(&memcg->css);
goto out;
+ }
new_cachep = kmem_cache_dup(memcg, cachep);
if (new_cachep == NULL) {
new_cachep = cachep;
+ css_put(&memcg->css);
goto out;
}
- mem_cgroup_get(memcg);
atomic_set(&new_cachep->memcg_params->nr_pages , 0);
cachep->memcg_params->memcg_caches[idx] = new_cachep;
@@ -3509,8 +3491,6 @@ static void memcg_create_cache_work_func(struct work_struct *w)
cw = container_of(w, struct create_work, work);
memcg_create_kmem_cache(cw->memcg, cw->cachep);
- /* Drop the reference gotten when we enqueued. */
- css_put(&cw->memcg->css);
kfree(cw);
}
@@ -3647,6 +3627,34 @@ __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order)
int ret;
*_memcg = NULL;
+
+ /*
+ * Disabling accounting is only relevant for some specific memcg
+ * internal allocations. Therefore we would initially not have such
+ * check here, since direct calls to the page allocator that are marked
+ * with GFP_KMEMCG only happen outside memcg core. We are mostly
+ * concerned with cache allocations, and by having this test at
+ * memcg_kmem_get_cache, we are already able to relay the allocation to
+ * the root cache and bypass the memcg cache altogether.
+ *
+ * There is one exception, though: the SLUB allocator does not create
+ * large order caches, but rather service large kmallocs directly from
+ * the page allocator. Therefore, the following sequence when backed by
+ * the SLUB allocator:
+ *
+ * memcg_stop_kmem_account();
+ * kmalloc(<large_number>)
+ * memcg_resume_kmem_account();
+ *
+ * would effectively ignore the fact that we should skip accounting,
+ * since it will drive us directly to this function without passing
+ * through the cache selector memcg_kmem_get_cache. Such large
+ * allocations are extremely rare but can happen, for instance, for the
+ * cache arrays. We bring this test here.
+ */
+ if (!current->mm || current->memcg_kmem_skip_account)
+ return true;
+
memcg = try_get_mem_cgroup_from_mm(current->mm);
/*
@@ -4200,12 +4208,12 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype,
unlock_page_cgroup(pc);
/*
* even after unlock, we have memcg->res.usage here and this memcg
- * will never be freed.
+ * will never be freed, so it's safe to call css_get().
*/
memcg_check_events(memcg, page);
if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) {
mem_cgroup_swap_statistics(memcg, true);
- mem_cgroup_get(memcg);
+ css_get(&memcg->css);
}
/*
* Migration does not charge the res_counter for the
@@ -4317,7 +4325,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout)
/*
* record memcg information, if swapout && memcg != NULL,
- * mem_cgroup_get() was called in uncharge().
+ * css_get() was called in uncharge().
*/
if (do_swap_account && swapout && memcg)
swap_cgroup_record(ent, css_id(&memcg->css));
@@ -4348,7 +4356,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
if (!mem_cgroup_is_root(memcg))
res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
mem_cgroup_swap_statistics(memcg, false);
- mem_cgroup_put(memcg);
+ css_put(&memcg->css);
}
rcu_read_unlock();
}
@@ -4382,11 +4390,14 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry,
* This function is only called from task migration context now.
* It postpones res_counter and refcount handling till the end
* of task migration(mem_cgroup_clear_mc()) for performance
- * improvement. But we cannot postpone mem_cgroup_get(to)
- * because if the process that has been moved to @to does
- * swap-in, the refcount of @to might be decreased to 0.
+ * improvement. But we cannot postpone css_get(to) because if
+ * the process that has been moved to @to does swap-in, the
+ * refcount of @to might be decreased to 0.
+ *
+ * We are in attach() phase, so the cgroup is guaranteed to be
+ * alive, so we can just call css_get().
*/
- mem_cgroup_get(to);
+ css_get(&to->css);
return 0;
}
return -EINVAL;
@@ -5165,14 +5176,6 @@ static int memcg_update_kmem_limit(struct cgroup *cont, u64 val)
* starts accounting before all call sites are patched
*/
memcg_kmem_set_active(memcg);
-
- /*
- * kmem charges can outlive the cgroup. In the case of slab
- * pages, for instance, a page contain objects from various
- * processes, so it is unfeasible to migrate them away. We
- * need to reference count the memcg because of that.
- */
- mem_cgroup_get(memcg);
} else
ret = res_counter_set_limit(&memcg->kmem, val);
out:
@@ -5205,16 +5208,16 @@ static int memcg_propagate_kmem(struct mem_cgroup *memcg)
goto out;
/*
- * destroy(), called if we fail, will issue static_key_slow_inc() and
- * mem_cgroup_put() if kmem is enabled. We have to either call them
- * unconditionally, or clear the KMEM_ACTIVE flag. I personally find
- * this more consistent, since it always leads to the same destroy path
+ * __mem_cgroup_free() will issue static_key_slow_dec() because this
+ * memcg is active already. If the later initialization fails then the
+ * cgroup core triggers the cleanup so we do not have to do it here.
*/
- mem_cgroup_get(memcg);
static_key_slow_inc(&memcg_kmem_enabled_key);
mutex_lock(&set_limit_mutex);
+ memcg_stop_kmem_account();
ret = memcg_update_cache_sizes(memcg);
+ memcg_resume_kmem_account();
mutex_unlock(&set_limit_mutex);
out:
return ret;
@@ -5893,23 +5896,43 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return mem_cgroup_sockets_init(memcg, ss);
}
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
{
mem_cgroup_sockets_destroy(memcg);
+}
+
+static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
+{
+ if (!memcg_kmem_is_active(memcg))
+ return;
+
+ /*
+ * kmem charges can outlive the cgroup. In the case of slab
+ * pages, for instance, a page contain objects from various
+ * processes. As we prevent from taking a reference for every
+ * such allocation we have to be careful when doing uncharge
+ * (see memcg_uncharge_kmem) and here during offlining.
+ *
+ * The idea is that that only the _last_ uncharge which sees
+ * the dead memcg will drop the last reference. An additional
+ * reference is taken here before the group is marked dead
+ * which is then paired with css_put during uncharge resp. here.
+ *
+ * Although this might sound strange as this path is called from
+ * css_offline() when the referencemight have dropped down to 0
+ * and shouldn't be incremented anymore (css_tryget would fail)
+ * we do not have other options because of the kmem allocations
+ * lifetime.
+ */
+ css_get(&memcg->css);
memcg_kmem_mark_dead(memcg);
if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0)
return;
- /*
- * Charges already down to 0, undo mem_cgroup_get() done in the charge
- * path here, being careful not to race with memcg_uncharge_kmem: it is
- * possible that the charges went down to 0 between mark_dead and the
- * res_counter read, so in that case, we don't need the put
- */
if (memcg_kmem_test_and_clear_dead(memcg))
- mem_cgroup_put(memcg);
+ css_put(&memcg->css);
}
#else
static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
@@ -5917,7 +5940,11 @@ static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
return 0;
}
-static void kmem_cgroup_destroy(struct mem_cgroup *memcg)
+static void memcg_destroy_kmem(struct mem_cgroup *memcg)
+{
+}
+
+static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
{
}
#endif
@@ -6087,13 +6114,13 @@ static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
mz->on_tree = false;
mz->memcg = memcg;
}
- memcg->info.nodeinfo[node] = pn;
+ memcg->nodeinfo[node] = pn;
return 0;
}
static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node)
{
- kfree(memcg->info.nodeinfo[node]);
+ kfree(memcg->nodeinfo[node]);
}
static struct mem_cgroup *mem_cgroup_alloc(void)
@@ -6166,49 +6193,6 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg)
vfree(memcg);
}
-
-/*
- * Helpers for freeing a kmalloc()ed/vzalloc()ed mem_cgroup by RCU,
- * but in process context. The work_freeing structure is overlaid
- * on the rcu_freeing structure, which itself is overlaid on memsw.
- */
-static void free_work(struct work_struct *work)
-{
- struct mem_cgroup *memcg;
-
- memcg = container_of(work, struct mem_cgroup, work_freeing);
- __mem_cgroup_free(memcg);
-}
-
-static void free_rcu(struct rcu_head *rcu_head)
-{
- struct mem_cgroup *memcg;
-
- memcg = container_of(rcu_head, struct mem_cgroup, rcu_freeing);
- INIT_WORK(&memcg->work_freeing, free_work);
- schedule_work(&memcg->work_freeing);
-}
-
-static void mem_cgroup_get(struct mem_cgroup *memcg)
-{
- atomic_inc(&memcg->refcnt);
-}
-
-static void __mem_cgroup_put(struct mem_cgroup *memcg, int count)
-{
- if (atomic_sub_and_test(count, &memcg->refcnt)) {
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- call_rcu(&memcg->rcu_freeing, free_rcu);
- if (parent)
- mem_cgroup_put(parent);
- }
-}
-
-static void mem_cgroup_put(struct mem_cgroup *memcg)
-{
- __mem_cgroup_put(memcg, 1);
-}
-
/*
* Returns the parent mem_cgroup in memcgroup hierarchy with hierarchy enabled.
*/
@@ -6268,7 +6252,6 @@ mem_cgroup_css_alloc(struct cgroup *cont)
memcg->last_scanned_node = MAX_NUMNODES;
INIT_LIST_HEAD(&memcg->oom_notify);
- atomic_set(&memcg->refcnt, 1);
memcg->move_charge_at_immigrate = 0;
mutex_init(&memcg->thresholds_lock);
spin_lock_init(&memcg->move_lock);
@@ -6304,12 +6287,9 @@ mem_cgroup_css_online(struct cgroup *cont)
res_counter_init(&memcg->kmem, &parent->kmem);
/*
- * We increment refcnt of the parent to ensure that we can
- * safely access it on res_counter_charge/uncharge.
- * This refcnt will be decremented when freeing this
- * mem_cgroup(see mem_cgroup_put).
+ * No need to take a reference to the parent because cgroup
+ * core guarantees its existence.
*/
- mem_cgroup_get(parent);
} else {
res_counter_init(&memcg->res, NULL);
res_counter_init(&memcg->memsw, NULL);
@@ -6325,16 +6305,6 @@ mem_cgroup_css_online(struct cgroup *cont)
error = memcg_init_kmem(memcg, &mem_cgroup_subsys);
mutex_unlock(&memcg_create_mutex);
- if (error) {
- /*
- * We call put now because our (and parent's) refcnts
- * are already in place. mem_cgroup_put() will internally
- * call __mem_cgroup_free, so return directly
- */
- mem_cgroup_put(memcg);
- if (parent->use_hierarchy)
- mem_cgroup_put(parent);
- }
return error;
}
@@ -6360,6 +6330,8 @@ static void mem_cgroup_css_offline(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
+ kmem_cgroup_css_offline(memcg);
+
mem_cgroup_invalidate_reclaim_iterators(memcg);
mem_cgroup_reparent_charges(memcg);
mem_cgroup_destroy_all_caches(memcg);
@@ -6369,9 +6341,8 @@ static void mem_cgroup_css_free(struct cgroup *cont)
{
struct mem_cgroup *memcg = mem_cgroup_from_cont(cont);
- kmem_cgroup_destroy(memcg);
-
- mem_cgroup_put(memcg);
+ memcg_destroy_kmem(memcg);
+ __mem_cgroup_free(memcg);
}
#ifdef CONFIG_MMU
@@ -6680,6 +6651,7 @@ static void __mem_cgroup_clear_mc(void)
{
struct mem_cgroup *from = mc.from;
struct mem_cgroup *to = mc.to;
+ int i;
/* we must uncharge all the leftover precharges from mc.to */
if (mc.precharge) {
@@ -6700,7 +6672,9 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.from))
res_counter_uncharge(&mc.from->memsw,
PAGE_SIZE * mc.moved_swap);
- __mem_cgroup_put(mc.from, mc.moved_swap);
+
+ for (i = 0; i < mc.moved_swap; i++)
+ css_put(&mc.from->css);
if (!mem_cgroup_is_root(mc.to)) {
/*
@@ -6710,7 +6684,7 @@ static void __mem_cgroup_clear_mc(void)
res_counter_uncharge(&mc.to->res,
PAGE_SIZE * mc.moved_swap);
}
- /* we've already done mem_cgroup_get(mc.to) */
+ /* we've already done css_get(mc.to) */
mc.moved_swap = 0;
}
memcg_oom_recover(from);
diff --git a/mm/memory.c b/mm/memory.c
index b68812d..1ce2e2a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1150,7 +1150,7 @@ again:
if (pte_dirty(ptent))
set_page_dirty(page);
if (pte_young(ptent) &&
- likely(!VM_SequentialReadHint(vma)))
+ likely(!(vma->vm_flags & VM_SEQ_READ)))
mark_page_accessed(page);
rss[MM_FILEPAGES]--;
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index f5ba127..ca1dd3a 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -208,13 +208,13 @@ void register_page_bootmem_info_node(struct pglist_data *pgdat)
pfn = pgdat->node_start_pfn;
end_pfn = pgdat_end_pfn(pgdat);
- /* register_section info */
+ /* register section info */
for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
/*
* Some platforms can assign the same pfn to multiple nodes - on
* node0 as well as nodeN. To avoid registering a pfn against
* multiple nodes we check that this pfn does not already
- * reside in some other node.
+ * reside in some other nodes.
*/
if (pfn_valid(pfn) && (pfn_to_nid(pfn) == node))
register_page_bootmem_info_section(pfn);
@@ -914,19 +914,19 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, int online_typ
if ((zone_idx(zone) > ZONE_NORMAL || online_type == ONLINE_MOVABLE) &&
!can_online_high_movable(zone)) {
unlock_memory_hotplug();
- return -1;
+ return -EINVAL;
}
if (online_type == ONLINE_KERNEL && zone_idx(zone) == ZONE_MOVABLE) {
if (move_pfn_range_left(zone - 1, zone, pfn, pfn + nr_pages)) {
unlock_memory_hotplug();
- return -1;
+ return -EINVAL;
}
}
if (online_type == ONLINE_MOVABLE && zone_idx(zone) == ZONE_MOVABLE - 1) {
if (move_pfn_range_right(zone, zone + 1, pfn, pfn + nr_pages)) {
unlock_memory_hotplug();
- return -1;
+ return -EINVAL;
}
}
diff --git a/mm/mmap.c b/mm/mmap.c
index 8468ffd..f813111 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1358,18 +1358,19 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
if (!(flags & MAP_ANONYMOUS)) {
audit_mmap_fd(fd, flags);
- if (unlikely(flags & MAP_HUGETLB))
- return -EINVAL;
file = fget(fd);
if (!file)
goto out;
if (is_file_hugepages(file))
len = ALIGN(len, huge_page_size(hstate_file(file)));
+ retval = -EINVAL;
+ if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+ goto out_fput;
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
- struct hstate *hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) &
- SHM_HUGE_MASK);
+ struct hstate *hs;
+ hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & SHM_HUGE_MASK);
if (!hs)
return -EINVAL;
@@ -1391,6 +1392,7 @@ SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);
retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
+out_fput:
if (file)
fput(file);
out:
diff --git a/mm/mremap.c b/mm/mremap.c
index 3708655..457d34e 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -456,13 +456,14 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
unsigned long charged = 0;
bool locked = false;
- down_write(&current->mm->mmap_sem);
-
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
- goto out;
+ return ret;
+
+ if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
+ return ret;
if (addr & ~PAGE_MASK)
- goto out;
+ return ret;
old_len = PAGE_ALIGN(old_len);
new_len = PAGE_ALIGN(new_len);
@@ -473,12 +474,13 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* a zero new-len is nonsensical.
*/
if (!new_len)
- goto out;
+ return ret;
+
+ down_write(&current->mm->mmap_sem);
if (flags & MREMAP_FIXED) {
- if (flags & MREMAP_MAYMOVE)
- ret = mremap_to(addr, old_len, new_addr, new_len,
- &locked);
+ ret = mremap_to(addr, old_len, new_addr, new_len,
+ &locked);
goto out;
}
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 327516b..b100255 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -204,6 +204,7 @@ static char * const zone_names[MAX_NR_ZONES] = {
};
int min_free_kbytes = 1024;
+int user_min_free_kbytes;
static unsigned long __meminitdata nr_kernel_pages;
static unsigned long __meminitdata nr_all_pages;
@@ -1046,7 +1047,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype)
* MIGRATE_CMA areas.
*/
if (!is_migrate_cma(migratetype) &&
- (unlikely(current_order >= pageblock_order / 2) ||
+ (current_order >= pageblock_order / 2 ||
start_migratetype == MIGRATE_RECLAIMABLE ||
page_group_by_mobility_disabled)) {
int pages;
@@ -3153,12 +3154,10 @@ static void zoneref_set_zone(struct zone *zone, struct zoneref *zoneref)
* Add all populated zones of a node to the zonelist.
*/
static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
- int nr_zones, enum zone_type zone_type)
+ int nr_zones)
{
struct zone *zone;
-
- BUG_ON(zone_type >= MAX_NR_ZONES);
- zone_type++;
+ enum zone_type zone_type = MAX_NR_ZONES;
do {
zone_type--;
@@ -3168,8 +3167,8 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
&zonelist->_zonerefs[nr_zones++]);
check_highest_zone(zone_type);
}
-
} while (zone_type);
+
return nr_zones;
}
@@ -3363,8 +3362,7 @@ static void build_zonelists_in_node_order(pg_data_t *pgdat, int node)
zonelist = &pgdat->node_zonelists[0];
for (j = 0; zonelist->_zonerefs[j].zone != NULL; j++)
;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j,
- MAX_NR_ZONES - 1);
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
}
@@ -3378,7 +3376,7 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
struct zonelist *zonelist;
zonelist = &pgdat->node_zonelists[1];
- j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+ j = build_zonelists_node(pgdat, zonelist, 0);
zonelist->_zonerefs[j].zone = NULL;
zonelist->_zonerefs[j].zone_idx = 0;
}
@@ -3586,7 +3584,7 @@ static void build_zonelists(pg_data_t *pgdat)
local_node = pgdat->node_id;
zonelist = &pgdat->node_zonelists[0];
- j = build_zonelists_node(pgdat, zonelist, 0, MAX_NR_ZONES - 1);
+ j = build_zonelists_node(pgdat, zonelist, 0);
/*
* Now we build the zonelist so that it contains the zones
@@ -3599,14 +3597,12 @@ static void build_zonelists(pg_data_t *pgdat)
for (node = local_node + 1; node < MAX_NUMNODES; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j,
- MAX_NR_ZONES - 1);
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j);
}
for (node = 0; node < local_node; node++) {
if (!node_online(node))
continue;
- j = build_zonelists_node(NODE_DATA(node), zonelist, j,
- MAX_NR_ZONES - 1);
+ j = build_zonelists_node(NODE_DATA(node), zonelist, j);
}
zonelist->_zonerefs[j].zone = NULL;
@@ -4421,13 +4417,13 @@ static void __meminit adjust_zone_range_for_zone_movable(int nid,
*/
static unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
unsigned long *ignored)
{
- unsigned long node_start_pfn, node_end_pfn;
unsigned long zone_start_pfn, zone_end_pfn;
- /* Get the start and end of the node and zone */
- get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
+ /* Get the start and end of the zone */
zone_start_pfn = arch_zone_lowest_possible_pfn[zone_type];
zone_end_pfn = arch_zone_highest_possible_pfn[zone_type];
adjust_zone_range_for_zone_movable(nid, zone_type,
@@ -4482,14 +4478,14 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
/* Return the number of page frames in holes in a zone on a node */
static unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
unsigned long *ignored)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
- unsigned long node_start_pfn, node_end_pfn;
unsigned long zone_start_pfn, zone_end_pfn;
- get_pfn_range_for_nid(nid, &node_start_pfn, &node_end_pfn);
zone_start_pfn = clamp(node_start_pfn, zone_low, zone_high);
zone_end_pfn = clamp(node_end_pfn, zone_low, zone_high);
@@ -4502,6 +4498,8 @@ static unsigned long __meminit zone_absent_pages_in_node(int nid,
#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
unsigned long *zones_size)
{
return zones_size[zone_type];
@@ -4509,6 +4507,8 @@ static inline unsigned long __meminit zone_spanned_pages_in_node(int nid,
static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
unsigned long zone_type,
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
unsigned long *zholes_size)
{
if (!zholes_size)
@@ -4520,21 +4520,27 @@ static inline unsigned long __meminit zone_absent_pages_in_node(int nid,
#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
static void __meminit calculate_node_totalpages(struct pglist_data *pgdat,
- unsigned long *zones_size, unsigned long *zholes_size)
+ unsigned long node_start_pfn,
+ unsigned long node_end_pfn,
+ unsigned long *zones_size,
+ unsigned long *zholes_size)
{
unsigned long realtotalpages, totalpages = 0;
enum zone_type i;
for (i = 0; i < MAX_NR_ZONES; i++)
totalpages += zone_spanned_pages_in_node(pgdat->node_id, i,
- zones_size);
+ node_start_pfn,
+ node_end_pfn,
+ zones_size);
pgdat->node_spanned_pages = totalpages;
realtotalpages = totalpages;
for (i = 0; i < MAX_NR_ZONES; i++)
realtotalpages -=
zone_absent_pages_in_node(pgdat->node_id, i,
- zholes_size);
+ node_start_pfn, node_end_pfn,
+ zholes_size);
pgdat->node_present_pages = realtotalpages;
printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id,
realtotalpages);
@@ -4643,6 +4649,7 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages,
* NOTE: pgdat should get zeroed by caller.
*/
static void __paginginit free_area_init_core(struct pglist_data *pgdat,
+ unsigned long node_start_pfn, unsigned long node_end_pfn,
unsigned long *zones_size, unsigned long *zholes_size)
{
enum zone_type j;
@@ -4664,8 +4671,11 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat,
struct zone *zone = pgdat->node_zones + j;
unsigned long size, realsize, freesize, memmap_pages;
- size = zone_spanned_pages_in_node(nid, j, zones_size);
+ size = zone_spanned_pages_in_node(nid, j, node_start_pfn,
+ node_end_pfn, zones_size);
realsize = freesize = size - zone_absent_pages_in_node(nid, j,
+ node_start_pfn,
+ node_end_pfn,
zholes_size);
/*
@@ -4779,6 +4789,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
unsigned long node_start_pfn, unsigned long *zholes_size)
{
pg_data_t *pgdat = NODE_DATA(nid);
+ unsigned long start_pfn = 0;
+ unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
WARN_ON(pgdat->nr_zones || pgdat->classzone_idx);
@@ -4786,7 +4798,11 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
pgdat->node_id = nid;
pgdat->node_start_pfn = node_start_pfn;
init_zone_allows_reclaim(nid);
- calculate_node_totalpages(pgdat, zones_size, zholes_size);
+#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+#endif
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn,
+ zones_size, zholes_size);
alloc_node_mem_map(pgdat);
#ifdef CONFIG_FLAT_NODE_MEM_MAP
@@ -4795,7 +4811,8 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size,
(unsigned long)pgdat->node_mem_map);
#endif
- free_area_init_core(pgdat, zones_size, zholes_size);
+ free_area_init_core(pgdat, start_pfn, end_pfn,
+ zones_size, zholes_size);
}
#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
@@ -5573,14 +5590,21 @@ static void __meminit setup_per_zone_inactive_ratio(void)
int __meminit init_per_zone_wmark_min(void)
{
unsigned long lowmem_kbytes;
+ int new_min_free_kbytes;
lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
-
- min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
- if (min_free_kbytes < 128)
- min_free_kbytes = 128;
- if (min_free_kbytes > 65536)
- min_free_kbytes = 65536;
+ new_min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
+
+ if (new_min_free_kbytes > user_min_free_kbytes) {
+ min_free_kbytes = new_min_free_kbytes;
+ if (min_free_kbytes < 128)
+ min_free_kbytes = 128;
+ if (min_free_kbytes > 65536)
+ min_free_kbytes = 65536;
+ } else {
+ pr_warn("min_free_kbytes is not updated to %d because user defined value %d is preferred\n",
+ new_min_free_kbytes, user_min_free_kbytes);
+ }
setup_per_zone_wmarks();
refresh_zone_stat_thresholds();
setup_per_zone_lowmem_reserve();
@@ -5598,8 +5622,10 @@ int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
void __user *buffer, size_t *length, loff_t *ppos)
{
proc_dointvec(table, write, buffer, length, ppos);
- if (write)
+ if (write) {
+ user_min_free_kbytes = min_free_kbytes;
setup_per_zone_wmarks();
+ }
return 0;
}
diff --git a/mm/rmap.c b/mm/rmap.c
index e22ceeb..cd356df 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -720,7 +720,7 @@ int page_referenced_one(struct page *page, struct vm_area_struct *vma,
* mapping is already gone, the unmap path will have
* set PG_referenced or activated the page.
*/
- if (likely(!VM_SequentialReadHint(vma)))
+ if (likely(!(vma->vm_flags & VM_SEQ_READ)))
referenced++;
}
pte_unmap_unlock(pte, ptl);
diff --git a/mm/sparse.c b/mm/sparse.c
index b38400f..308d5033 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -753,6 +753,7 @@ out:
return ret;
}
+#ifdef CONFIG_MEMORY_HOTREMOVE
#ifdef CONFIG_MEMORY_FAILURE
static void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
{
@@ -774,7 +775,6 @@ static inline void clear_hwpoisoned_pages(struct page *memmap, int nr_pages)
}
#endif
-#ifdef CONFIG_MEMORY_HOTREMOVE
static void free_section_usemap(struct page *memmap, unsigned long *usemap)
{
struct page *usemap_page;
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 91a1047..13a5495 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -388,12 +388,12 @@ nocache:
addr = ALIGN(first->va_end, align);
if (addr < vstart)
goto nocache;
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
} else {
addr = ALIGN(vstart, align);
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
n = vmap_area_root.rb_node;
@@ -420,7 +420,7 @@ nocache:
if (addr + cached_hole_size < first->va_start)
cached_hole_size = first->va_start - addr;
addr = ALIGN(first->va_end, align);
- if (addr + size - 1 < addr)
+ if (addr + size < addr)
goto overflow;
if (list_is_last(&first->list, &vmap_area_list))
@@ -754,7 +754,6 @@ struct vmap_block {
struct vmap_area *va;
struct vmap_block_queue *vbq;
unsigned long free, dirty;
- DECLARE_BITMAP(alloc_map, VMAP_BBMAP_BITS);
DECLARE_BITMAP(dirty_map, VMAP_BBMAP_BITS);
struct list_head free_list;
struct rcu_head rcu_head;
@@ -820,7 +819,6 @@ static struct vmap_block *new_vmap_block(gfp_t gfp_mask)
vb->va = va;
vb->free = VMAP_BBMAP_BITS;
vb->dirty = 0;
- bitmap_zero(vb->alloc_map, VMAP_BBMAP_BITS);
bitmap_zero(vb->dirty_map, VMAP_BBMAP_BITS);
INIT_LIST_HEAD(&vb->free_list);
@@ -873,7 +871,6 @@ static void purge_fragmented_blocks(int cpu)
if (vb->free + vb->dirty == VMAP_BBMAP_BITS && vb->dirty != VMAP_BBMAP_BITS) {
vb->free = 0; /* prevent further allocs after releasing lock */
vb->dirty = VMAP_BBMAP_BITS; /* prevent purging it again */
- bitmap_fill(vb->alloc_map, VMAP_BBMAP_BITS);
bitmap_fill(vb->dirty_map, VMAP_BBMAP_BITS);
spin_lock(&vbq->lock);
list_del_rcu(&vb->free_list);
@@ -891,11 +888,6 @@ static void purge_fragmented_blocks(int cpu)
}
}
-static void purge_fragmented_blocks_thiscpu(void)
-{
- purge_fragmented_blocks(smp_processor_id());
-}
-
static void purge_fragmented_blocks_allcpus(void)
{
int cpu;
@@ -910,7 +902,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
struct vmap_block *vb;
unsigned long addr = 0;
unsigned int order;
- int purge = 0;
BUG_ON(size & ~PAGE_MASK);
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
@@ -934,17 +925,7 @@ again:
if (vb->free < 1UL << order)
goto next;
- i = bitmap_find_free_region(vb->alloc_map,
- VMAP_BBMAP_BITS, order);
-
- if (i < 0) {
- if (vb->free + vb->dirty == VMAP_BBMAP_BITS) {
- /* fragmented and no outstanding allocations */
- BUG_ON(vb->dirty != VMAP_BBMAP_BITS);
- purge = 1;
- }
- goto next;
- }
+ i = VMAP_BBMAP_BITS - vb->free;
addr = vb->va->va_start + (i << PAGE_SHIFT);
BUG_ON(addr_to_vb_idx(addr) !=
addr_to_vb_idx(vb->va->va_start));
@@ -960,9 +941,6 @@ next:
spin_unlock(&vb->lock);
}
- if (purge)
- purge_fragmented_blocks_thiscpu();
-
put_cpu_var(vmap_block_queue);
rcu_read_unlock();
@@ -1311,15 +1289,15 @@ static void setup_vmalloc_vm(struct vm_struct *vm, struct vmap_area *va,
spin_unlock(&vmap_area_lock);
}
-static void clear_vm_unlist(struct vm_struct *vm)
+static void clear_vm_uninitialized_flag(struct vm_struct *vm)
{
/*
- * Before removing VM_UNLIST,
+ * Before removing VM_UNINITIALIZED,
* we should make sure that vm has proper values.
* Pair with smp_rmb() in show_numa_info().
*/
smp_wmb();
- vm->flags &= ~VM_UNLIST;
+ vm->flags &= ~VM_UNINITIALIZED;
}
static struct vm_struct *__get_vm_area_node(unsigned long size,
@@ -1453,7 +1431,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
return;
if (WARN(!PAGE_ALIGNED(addr), "Trying to vfree() bad address (%p)\n",
- addr));
+ addr))
return;
area = remove_vm_area(addr);
@@ -1499,7 +1477,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
* conventions for vfree() arch-depenedent would be a really bad idea)
*
* NOTE: assumes that the object at *addr has a size >= sizeof(llist_node)
- *
*/
void vfree(const void *addr)
{
@@ -1511,8 +1488,8 @@ void vfree(const void *addr)
return;
if (unlikely(in_interrupt())) {
struct vfree_deferred *p = &__get_cpu_var(vfree_deferred);
- llist_add((struct llist_node *)addr, &p->list);
- schedule_work(&p->wq);
+ if (llist_add((struct llist_node *)addr, &p->list))
+ schedule_work(&p->wq);
} else
__vunmap(addr, 1);
}
@@ -1657,21 +1634,21 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align,
if (!size || (size >> PAGE_SHIFT) > totalram_pages)
goto fail;
- area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNLIST,
+ area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED,
start, end, node, gfp_mask, caller);
if (!area)
goto fail;
addr = __vmalloc_area_node(area, gfp_mask, prot, node, caller);
if (!addr)
- return NULL;
+ goto fail;
/*
- * In this function, newly allocated vm_struct has VM_UNLIST flag.
- * It means that vm_struct is not fully initialized.
+ * In this function, newly allocated vm_struct has VM_UNINITIALIZED
+ * flag. It means that vm_struct is not fully initialized.
* Now, it is fully initialized, so remove this flag here.
*/
- clear_vm_unlist(area);
+ clear_vm_uninitialized_flag(area);
/*
* A ref_count = 3 is needed because the vm_struct and vmap_area
@@ -2591,11 +2568,6 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
if (!counters)
return;
- /* Pair with smp_wmb() in clear_vm_unlist() */
- smp_rmb();
- if (v->flags & VM_UNLIST)
- return;
-
memset(counters, 0, nr_node_ids * sizeof(unsigned int));
for (nr = 0; nr < v->nr_pages; nr++)
@@ -2624,6 +2596,11 @@ static int s_show(struct seq_file *m, void *p)
v = va->vm;
+ /* Pair with smp_wmb() in clear_vm_uninitialized_flag() */
+ smp_rmb();
+ if (v->flags & VM_UNINITIALIZED)
+ return 0;
+
seq_printf(m, "0x%pK-0x%pK %7ld",
v->addr, v->addr + v->size, v->size);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 99b3ac7..2cff0d4 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1443,25 +1443,11 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
* as there is no guarantee the dirtying process is throttled in the
* same way balance_dirty_pages() manages.
*
- * This scales the number of dirty pages that must be under writeback
- * before a zone gets flagged ZONE_WRITEBACK. It is a simple backoff
- * function that has the most effect in the range DEF_PRIORITY to
- * DEF_PRIORITY-2 which is the priority reclaim is considered to be
- * in trouble and reclaim is considered to be in trouble.
- *
- * DEF_PRIORITY 100% isolated pages must be PageWriteback to throttle
- * DEF_PRIORITY-1 50% must be PageWriteback
- * DEF_PRIORITY-2 25% must be PageWriteback, kswapd in trouble
- * ...
- * DEF_PRIORITY-6 For SWAP_CLUSTER_MAX isolated pages, throttle if any
- * isolated page is PageWriteback
- *
* Once a zone is flagged ZONE_WRITEBACK, kswapd will count the number
* of pages under pages flagged for immediate reclaim and stall if any
* are encountered in the nr_immediate check below.
*/
- if (nr_writeback && nr_writeback >=
- (nr_taken >> (DEF_PRIORITY - sc->priority)))
+ if (nr_writeback && nr_writeback == nr_taken)
zone_set_flag(zone, ZONE_WRITEBACK);
/*
@@ -2361,8 +2347,10 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
aborted_reclaim = shrink_zones(zonelist, sc);
/*
- * Don't shrink slabs when reclaiming memory from
- * over limit cgroups
+ * Don't shrink slabs when reclaiming memory from over limit
+ * cgroups but do shrink slab at least once when aborting
+ * reclaim for compaction to avoid unevenly scanning file/anon
+ * LRU pages over slab pages.
*/
if (global_reclaim(sc)) {
unsigned long lru_pages = 0;
@@ -2404,7 +2392,7 @@ static unsigned long do_try_to_free_pages(struct zonelist *zonelist,
WB_REASON_TRY_TO_FREE_PAGES);
sc->may_writepage = 1;
}
- } while (--sc->priority >= 0);
+ } while (--sc->priority >= 0 && !aborted_reclaim);
out:
delayacct_freepages_end();
OpenPOWER on IntegriCloud