From 1edf223485c42c99655dcd001db1e46ad5e5d2d7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:06:57 -0800 Subject: mm/page-writeback.c: make determine_dirtyable_memory static again The tracing ring-buffer used this function briefly, but not anymore. Make it local to the writeback code again. Also, move the function so that no forward declaration needs to be reintroduced. Signed-off-by: Johannes Weiner Acked-by: Mel Gorman Reviewed-by: Michal Hocko Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 122 ++++++++++++++++++++++++++-------------------------- 1 file changed, 60 insertions(+), 62 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 8616ef3..c081bf6 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -130,6 +130,66 @@ unsigned long global_dirty_limit; static struct prop_descriptor vm_completions; /* + * Work out the current dirty-memory clamping and background writeout + * thresholds. + * + * The main aim here is to lower them aggressively if there is a lot of mapped + * memory around. To avoid stressing page reclaim with lots of unreclaimable + * pages. It is better to clamp down on writers than to start swapping, and + * performing lots of scanning. + * + * We only allow 1/2 of the currently-unmapped memory to be dirtied. + * + * We don't permit the clamping level to fall below 5% - that is getting rather + * excessive. + * + * We make sure that the background writeout level is below the adjusted + * clamping level. + */ +static unsigned long highmem_dirtyable_memory(unsigned long total) +{ +#ifdef CONFIG_HIGHMEM + int node; + unsigned long x = 0; + + for_each_node_state(node, N_HIGH_MEMORY) { + struct zone *z = + &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; + + x += zone_page_state(z, NR_FREE_PAGES) + + zone_reclaimable_pages(z); + } + /* + * Make sure that the number of highmem pages is never larger + * than the number of the total dirtyable memory. This can only + * occur in very strange VM situations but we want to make sure + * that this does not occur. + */ + return min(x, total); +#else + return 0; +#endif +} + +/** + * determine_dirtyable_memory - amount of memory that may be used + * + * Returns the numebr of pages that can currently be freed and used + * by the kernel for direct mappings. + */ +static unsigned long determine_dirtyable_memory(void) +{ + unsigned long x; + + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + + if (!vm_highmem_is_dirtyable) + x -= highmem_dirtyable_memory(x); + + return x + 1; /* Ensure that we never return 0 */ +} + +/* * couple the period to the dirty_ratio: * * period/2 ~ roundup_pow_of_two(dirty limit) @@ -196,7 +256,6 @@ int dirty_ratio_handler(struct ctl_table *table, int write, return ret; } - int dirty_bytes_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) @@ -291,67 +350,6 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) } EXPORT_SYMBOL(bdi_set_max_ratio); -/* - * Work out the current dirty-memory clamping and background writeout - * thresholds. - * - * The main aim here is to lower them aggressively if there is a lot of mapped - * memory around. To avoid stressing page reclaim with lots of unreclaimable - * pages. It is better to clamp down on writers than to start swapping, and - * performing lots of scanning. - * - * We only allow 1/2 of the currently-unmapped memory to be dirtied. - * - * We don't permit the clamping level to fall below 5% - that is getting rather - * excessive. - * - * We make sure that the background writeout level is below the adjusted - * clamping level. - */ - -static unsigned long highmem_dirtyable_memory(unsigned long total) -{ -#ifdef CONFIG_HIGHMEM - int node; - unsigned long x = 0; - - for_each_node_state(node, N_HIGH_MEMORY) { - struct zone *z = - &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; - - x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); - } - /* - * Make sure that the number of highmem pages is never larger - * than the number of the total dirtyable memory. This can only - * occur in very strange VM situations but we want to make sure - * that this does not occur. - */ - return min(x, total); -#else - return 0; -#endif -} - -/** - * determine_dirtyable_memory - amount of memory that may be used - * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. - */ -unsigned long determine_dirtyable_memory(void) -{ - unsigned long x; - - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); - - if (!vm_highmem_is_dirtyable) - x -= highmem_dirtyable_memory(x); - - return x + 1; /* Ensure that we never return 0 */ -} - static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { -- cgit v1.1 From ab8fabd46f811d5153d8a0cd2fac9a0d41fb593d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:42 -0800 Subject: mm: exclude reserved pages from dirtyable memory Per-zone dirty limits try to distribute page cache pages allocated for writing across zones in proportion to the individual zone sizes, to reduce the likelihood of reclaim having to write back individual pages from the LRU lists in order to make progress. This patch: The amount of dirtyable pages should not include the full number of free pages: there is a number of reserved pages that the page allocator and kswapd always try to keep free. The closer (reclaimable pages - dirty pages) is to the number of reserved pages, the more likely it becomes for reclaim to run into dirty pages: +----------+ --- | anon | | +----------+ | | | | | | -- dirty limit new -- flusher new | file | | | | | | | | | -- dirty limit old -- flusher old | | | +----------+ --- reclaim | reserved | +----------+ | kernel | +----------+ This patch introduces a per-zone dirty reserve that takes both the lowmem reserve as well as the high watermark of the zone into account, and a global sum of those per-zone values that is subtracted from the global amount of dirtyable pages. The lowmem reserve is unavailable to page cache allocations and kswapd tries to keep the high watermark free. We don't want to end up in a situation where reclaim has to clean pages in order to balance zones. Not treating reserved pages as dirtyable on a global level is only a conceptual fix. In reality, dirty pages are not distributed equally across zones and reclaim runs into dirty pages on a regular basis. But it is important to get this right before tackling the problem on a per-zone level, where the distance between reclaim and the dirty pages is mostly much smaller in absolute numbers. [akpm@linux-foundation.org: fix highmem build] Signed-off-by: Johannes Weiner Reviewed-by: Rik van Riel Reviewed-by: Michal Hocko Reviewed-by: Minchan Kim Acked-by: Mel Gorman Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index c081bf6..9ab6de8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -157,7 +157,7 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) &NODE_DATA(node)->node_zones[ZONE_HIGHMEM]; x += zone_page_state(z, NR_FREE_PAGES) + - zone_reclaimable_pages(z); + zone_reclaimable_pages(z) - z->dirty_balance_reserve; } /* * Make sure that the number of highmem pages is never larger @@ -181,7 +181,8 @@ static unsigned long determine_dirtyable_memory(void) { unsigned long x; - x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages(); + x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages() - + dirty_balance_reserve; if (!vm_highmem_is_dirtyable) x -= highmem_dirtyable_memory(x); -- cgit v1.1 From ccafa2879fb8d13b8031337a8743eac4189e5d6e Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:44 -0800 Subject: mm: writeback: cleanups in preparation for per-zone dirty limits The next patch will introduce per-zone dirty limiting functions in addition to the traditional global dirty limiting. Rename determine_dirtyable_memory() to global_dirtyable_memory() before adding the zone-specific version, and fix up its documentation. Also, move the functions to determine the dirtyable memory and the function to calculate the dirty limit based on that together so that their relationship is more apparent and that they can be commented on as a group. Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Mel Gorman Reviewed-by: Michal Hocko Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Wu Fengguang Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Rik van Riel Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 93 +++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 46 deletions(-) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 9ab6de8..433fa99 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -146,6 +146,7 @@ static struct prop_descriptor vm_completions; * We make sure that the background writeout level is below the adjusted * clamping level. */ + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -172,12 +173,12 @@ static unsigned long highmem_dirtyable_memory(unsigned long total) } /** - * determine_dirtyable_memory - amount of memory that may be used + * global_dirtyable_memory - number of globally dirtyable pages * - * Returns the numebr of pages that can currently be freed and used - * by the kernel for direct mappings. + * Returns the global number of pages potentially available for dirty + * page cache. This is the base value for the global dirty limits. */ -static unsigned long determine_dirtyable_memory(void) +unsigned long global_dirtyable_memory(void) { unsigned long x; @@ -191,6 +192,47 @@ static unsigned long determine_dirtyable_memory(void) } /* + * global_dirty_limits - background-writeback and dirty-throttling thresholds + * + * Calculate the dirty thresholds based on sysctl parameters + * - vm.dirty_background_ratio or vm.dirty_background_bytes + * - vm.dirty_ratio or vm.dirty_bytes + * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and + * real-time tasks. + */ +void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) +{ + unsigned long background; + unsigned long dirty; + unsigned long uninitialized_var(available_memory); + struct task_struct *tsk; + + if (!vm_dirty_bytes || !dirty_background_bytes) + available_memory = global_dirtyable_memory(); + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); + else + dirty = (vm_dirty_ratio * available_memory) / 100; + + if (dirty_background_bytes) + background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); + else + background = (dirty_background_ratio * available_memory) / 100; + + if (background >= dirty) + background = dirty / 2; + tsk = current; + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { + background += background / 4; + dirty += dirty / 4; + } + *pbackground = background; + *pdirty = dirty; + trace_global_dirty_state(background, dirty); +} + +/* * couple the period to the dirty_ratio: * * period/2 ~ roundup_pow_of_two(dirty limit) @@ -202,7 +244,7 @@ static int calc_period_shift(void) if (vm_dirty_bytes) dirty_total = vm_dirty_bytes / PAGE_SIZE; else - dirty_total = (vm_dirty_ratio * determine_dirtyable_memory()) / + dirty_total = (vm_dirty_ratio * global_dirtyable_memory()) / 100; return 2 + ilog2(dirty_total - 1); } @@ -362,47 +404,6 @@ static unsigned long hard_dirty_limit(unsigned long thresh) return max(thresh, global_dirty_limit); } -/* - * global_dirty_limits - background-writeback and dirty-throttling thresholds - * - * Calculate the dirty thresholds based on sysctl parameters - * - vm.dirty_background_ratio or vm.dirty_background_bytes - * - vm.dirty_ratio or vm.dirty_bytes - * The dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and - * real-time tasks. - */ -void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) -{ - unsigned long background; - unsigned long dirty; - unsigned long uninitialized_var(available_memory); - struct task_struct *tsk; - - if (!vm_dirty_bytes || !dirty_background_bytes) - available_memory = determine_dirtyable_memory(); - - if (vm_dirty_bytes) - dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE); - else - dirty = (vm_dirty_ratio * available_memory) / 100; - - if (dirty_background_bytes) - background = DIV_ROUND_UP(dirty_background_bytes, PAGE_SIZE); - else - background = (dirty_background_ratio * available_memory) / 100; - - if (background >= dirty) - background = dirty / 2; - tsk = current; - if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) { - background += background / 4; - dirty += dirty / 4; - } - *pbackground = background; - *pdirty = dirty; - trace_global_dirty_state(background, dirty); -} - /** * bdi_dirty_limit - @bdi's share of dirty throttling threshold * @bdi: the backing_dev_info to query -- cgit v1.1 From a756cf5908530e8b40bdf569eb48b40139e8d7fd Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 10 Jan 2012 15:07:49 -0800 Subject: mm: try to distribute dirty pages fairly across zones The maximum number of dirty pages that exist in the system at any time is determined by a number of pages considered dirtyable and a user-configured percentage of those, or an absolute number in bytes. This number of dirtyable pages is the sum of memory provided by all the zones in the system minus their lowmem reserves and high watermarks, so that the system can retain a healthy number of free pages without having to reclaim dirty pages. But there is a flaw in that we have a zoned page allocator which does not care about the global state but rather the state of individual memory zones. And right now there is nothing that prevents one zone from filling up with dirty pages while other zones are spared, which frequently leads to situations where kswapd, in order to restore the watermark of free pages, does indeed have to write pages from that zone's LRU list. This can interfere so badly with IO from the flusher threads that major filesystems (btrfs, xfs, ext4) mostly ignore write requests from reclaim already, taking away the VM's only possibility to keep such a zone balanced, aside from hoping the flushers will soon clean pages from that zone. Enter per-zone dirty limits. They are to a zone's dirtyable memory what the global limit is to the global amount of dirtyable memory, and try to make sure that no single zone receives more than its fair share of the globally allowed dirty pages in the first place. As the number of pages considered dirtyable excludes the zones' lowmem reserves and high watermarks, the maximum number of dirty pages in a zone is such that the zone can always be balanced without requiring page cleaning. As this is a placement decision in the page allocator and pages are dirtied only after the allocation, this patch allows allocators to pass __GFP_WRITE when they know in advance that the page will be written to and become dirty soon. The page allocator will then attempt to allocate from the first zone of the zonelist - which on NUMA is determined by the task's NUMA memory policy - that has not exceeded its dirty limit. At first glance, it would appear that the diversion to lower zones can increase pressure on them, but this is not the case. With a full high zone, allocations will be diverted to lower zones eventually, so it is more of a shift in timing of the lower zone allocations. Workloads that previously could fit their dirty pages completely in the higher zone may be forced to allocate from lower zones, but the amount of pages that "spill over" are limited themselves by the lower zones' dirty constraints, and thus unlikely to become a problem. For now, the problem of unfair dirty page distribution remains for NUMA configurations where the zones allowed for allocation are in sum not big enough to trigger the global dirty limits, wake up the flusher threads and remedy the situation. Because of this, an allocation that could not succeed on any of the considered zones is allowed to ignore the dirty limits before going into direct reclaim or even failing the allocation, until a future patch changes the global dirty throttling and flusher thread activation so that they take individual zone states into account. Test results 15M DMA + 3246M DMA32 + 504 Normal = 3765M memory 40% dirty ratio 16G USB thumb drive 10 runs of dd if=/dev/zero of=disk/zeroes bs=32k count=$((10 << 15)) seconds nr_vmscan_write (stddev) min| median| max xfs vanilla: 549.747( 3.492) 0.000| 0.000| 0.000 patched: 550.996( 3.802) 0.000| 0.000| 0.000 fuse-ntfs vanilla: 1183.094(53.178) 54349.000| 59341.000| 65163.000 patched: 558.049(17.914) 0.000| 0.000| 43.000 btrfs vanilla: 573.679(14.015) 156657.000| 460178.000| 606926.000 patched: 563.365(11.368) 0.000| 0.000| 1362.000 ext4 vanilla: 561.197(15.782) 0.000|2725438.000|4143837.000 patched: 568.806(17.496) 0.000| 0.000| 0.000 Signed-off-by: Johannes Weiner Reviewed-by: Minchan Kim Acked-by: Mel Gorman Reviewed-by: Michal Hocko Tested-by: Wu Fengguang Cc: KAMEZAWA Hiroyuki Cc: Christoph Hellwig Cc: Dave Chinner Cc: Jan Kara Cc: Shaohua Li Cc: Rik van Riel Cc: Chris Mason Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page-writeback.c | 82 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 82 insertions(+) (limited to 'mm/page-writeback.c') diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 433fa99..5cdd4f2 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -147,6 +147,24 @@ static struct prop_descriptor vm_completions; * clamping level. */ +/* + * In a memory zone, there is a certain amount of pages we consider + * available for the page cache, which is essentially the number of + * free and reclaimable pages, minus some zone reserves to protect + * lowmem and the ability to uphold the zone's watermarks without + * requiring writeback. + * + * This number of dirtyable pages is the base value of which the + * user-configurable dirty ratio is the effictive number of pages that + * are allowed to be actually dirtied. Per individual zone, or + * globally by using the sum of dirtyable pages over all zones. + * + * Because the user is allowed to specify the dirty limit globally as + * absolute number of bytes, calculating the per-zone dirty limit can + * require translating the configured limit into a percentage of + * global dirtyable memory first. + */ + static unsigned long highmem_dirtyable_memory(unsigned long total) { #ifdef CONFIG_HIGHMEM @@ -232,6 +250,70 @@ void global_dirty_limits(unsigned long *pbackground, unsigned long *pdirty) trace_global_dirty_state(background, dirty); } +/** + * zone_dirtyable_memory - number of dirtyable pages in a zone + * @zone: the zone + * + * Returns the zone's number of pages potentially available for dirty + * page cache. This is the base value for the per-zone dirty limits. + */ +static unsigned long zone_dirtyable_memory(struct zone *zone) +{ + /* + * The effective global number of dirtyable pages may exclude + * highmem as a big-picture measure to keep the ratio between + * dirty memory and lowmem reasonable. + * + * But this function is purely about the individual zone and a + * highmem zone can hold its share of dirty pages, so we don't + * care about vm_highmem_is_dirtyable here. + */ + return zone_page_state(zone, NR_FREE_PAGES) + + zone_reclaimable_pages(zone) - + zone->dirty_balance_reserve; +} + +/** + * zone_dirty_limit - maximum number of dirty pages allowed in a zone + * @zone: the zone + * + * Returns the maximum number of dirty pages allowed in a zone, based + * on the zone's dirtyable memory. + */ +static unsigned long zone_dirty_limit(struct zone *zone) +{ + unsigned long zone_memory = zone_dirtyable_memory(zone); + struct task_struct *tsk = current; + unsigned long dirty; + + if (vm_dirty_bytes) + dirty = DIV_ROUND_UP(vm_dirty_bytes, PAGE_SIZE) * + zone_memory / global_dirtyable_memory(); + else + dirty = vm_dirty_ratio * zone_memory / 100; + + if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) + dirty += dirty / 4; + + return dirty; +} + +/** + * zone_dirty_ok - tells whether a zone is within its dirty limits + * @zone: the zone to check + * + * Returns %true when the dirty pages in @zone are within the zone's + * dirty limit, %false if the limit is exceeded. + */ +bool zone_dirty_ok(struct zone *zone) +{ + unsigned long limit = zone_dirty_limit(zone); + + return zone_page_state(zone, NR_FILE_DIRTY) + + zone_page_state(zone, NR_UNSTABLE_NFS) + + zone_page_state(zone, NR_WRITEBACK) <= limit; +} + /* * couple the period to the dirty_ratio: * -- cgit v1.1