summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/migrate.h6
-rw-r--r--kernel/sched/fair.c9
-rw-r--r--mm/migrate.c20
3 files changed, 35 insertions, 0 deletions
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index f0d0313..9155688 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -77,11 +77,17 @@ static inline int migrate_huge_page_move_mapping(struct address_space *mapping,
#ifdef CONFIG_NUMA_BALANCING
extern int migrate_misplaced_page(struct page *page, int node);
+extern int migrate_misplaced_page(struct page *page, int node);
+extern bool migrate_ratelimited(int node);
#else
static inline int migrate_misplaced_page(struct page *page, int node)
{
return -EAGAIN; /* can't migrate now */
}
+static inline bool migrate_ratelimited(int node)
+{
+ return false;
+}
#endif /* CONFIG_NUMA_BALANCING */
#endif /* _LINUX_MIGRATE_H */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7727b01..37e895a9 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -27,6 +27,7 @@
#include <linux/profile.h>
#include <linux/interrupt.h>
#include <linux/mempolicy.h>
+#include <linux/migrate.h>
#include <linux/task_work.h>
#include <trace/events/sched.h>
@@ -861,6 +862,14 @@ void task_numa_work(struct callback_head *work)
if (cmpxchg(&mm->numa_next_scan, migrate, next_scan) != migrate)
return;
+ /*
+ * Do not set pte_numa if the current running node is rate-limited.
+ * This loses statistics on the fault but if we are unwilling to
+ * migrate to this node, it is less likely we can do useful work
+ */
+ if (migrate_ratelimited(numa_node_id()))
+ return;
+
start = mm->numa_scan_offset;
pages = sysctl_numa_balancing_scan_size;
pages <<= 20 - PAGE_SHIFT; /* MB in pages */
diff --git a/mm/migrate.c b/mm/migrate.c
index 4b8267f..32a1afc 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1464,10 +1464,30 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
* page migration rate limiting control.
* Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
* window of time. Default here says do not migrate more than 1280M per second.
+ * If a node is rate-limited then PTE NUMA updates are also rate-limited. However
+ * as it is faults that reset the window, pte updates will happen unconditionally
+ * if there has not been a fault since @pteupdate_interval_millisecs after the
+ * throttle window closed.
*/
static unsigned int migrate_interval_millisecs __read_mostly = 100;
+static unsigned int pteupdate_interval_millisecs __read_mostly = 1000;
static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
+/* Returns true if NUMA migration is currently rate limited */
+bool migrate_ratelimited(int node)
+{
+ pg_data_t *pgdat = NODE_DATA(node);
+
+ if (time_after(jiffies, pgdat->numabalancing_migrate_next_window +
+ msecs_to_jiffies(pteupdate_interval_millisecs)))
+ return false;
+
+ if (pgdat->numabalancing_migrate_nr_pages < ratelimit_pages)
+ return false;
+
+ return true;
+}
+
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
OpenPOWER on IntegriCloud