diff options
-rw-r--r-- | include/linux/migrate.h | 6 | ||||
-rw-r--r-- | kernel/sched/fair.c | 9 | ||||
-rw-r--r-- | mm/migrate.c | 20 |
3 files changed, 35 insertions, 0 deletions
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f0d0313..9155688 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -77,11 +77,17 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
 
 #ifdef CONFIG_NUMA_BALANCING
 extern int migrate_misplaced_page(struct page *page, int node);
+extern int migrate_misplaced_page(struct page *page, int node);
+extern bool migrate_ratelimited(int node);
 #else
 static inline int migrate_misplaced_page(struct page *page, int node)
 {
 	return -EAGAIN; /* can't migrate now */
 }
+static inline bool migrate_ratelimited(int node)
+{
+	return false;
+}
 #endif /* CONFIG_NUMA_BALANCING */
 
 #endif /* _LINUX_MIGRATE_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7727b01..37e895a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -27,6 +27,7 @@
 #include <linux/profile.h>
 #include <linux/interrupt.h>
 #include <linux/mempolicy.h>
+#include <linux/migrate.h>
 #include <linux/task_work.h>
 
 #include <trace/events/sched.h>
@@ -861,6 +862,14 @@ void task_numa_work(struct callback_head *work)
 	if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
 		return;
 
+	/*
+	 * Do not set pte_numa if the current running node is rate-limited.
+	 * This loses statistics on the fault but if we are unwilling to
+	 * migrate to this node, it is less likely we can do useful work
+	 */
+	if (migrate_ratelimited(numa_node_id()))
+		return;
+
 	start = mm->numa_scan_offset;
 	pages = sysctl_numa_balancing_scan_size;
 	pages <<= 20 - PAGE_SHIFT; /* MB in pages */
diff --git a/mm/migrate.c b/mm/migrate.c
index 4b8267f..32a1afc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1464,10 +1464,30 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
  * page migration rate limiting control.
  * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
  * window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
+ * as it is faults that reset the window, pte updates will happen unconditionally
+ * if there has not been a fault since @pteupdate_interval_millisecs after the
+ * throttle window closed.
  */
 static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
 static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
 
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+	pg_data_t *pgdat = NODE_DATA(node);
+
+	if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+					msecs_to_jiffies(pteupdate_interval_millisecs)))
+		return false;
+
+	if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+		return false;
+
+	return true;
+}
+
 /*
  * Attempt to migrate a misplaced page to the specified destination
  * node. Caller is expected to have an elevated reference count on