summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched.h9
-rw-r--r--kernel/sched/fair.c112
-rw-r--r--mm/huge_memory.c4
-rw-r--r--mm/memory.c9
4 files changed, 105 insertions, 29 deletions
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 59f953b..2292f6c 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1365,6 +1365,14 @@ struct task_struct {
*/
unsigned long *numa_faults_buffer;
+ /*
+ * numa_faults_locality tracks if faults recorded during the last
+ * scan window were remote/local. The task scan period is adapted
+ * based on the locality of the faults with different weights
+ * depending on whether they were shared or private faults
+ */
+ unsigned long numa_faults_locality[2];
+
int numa_preferred_nid;
unsigned long numa_pages_migrated;
#endif /* CONFIG_NUMA_BALANCING */
@@ -1455,6 +1463,7 @@ struct task_struct {
#define TNF_MIGRATED 0x01
#define TNF_NO_GROUP 0x02
#define TNF_SHARED 0x04
+#define TNF_FAULT_LOCAL 0x08
#ifdef CONFIG_NUMA_BALANCING
extern void task_numa_fault(int last_node, int node, int pages, int flags);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d26a16e..66237ff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p)
sched_setnuma(p, env.dst_nid);
+ /*
+ * Reset the scan period if the task is being rescheduled on an
+ * alternative node to recheck if the tasks is now properly placed.
+ */
+ p->numa_scan_period = task_scan_min(p);
+
if (env.best_task == NULL) {
int ret = migrate_task_to(p, env.best_cpu);
return ret;
@@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p)
p->numa_migrate_retry = jiffies + HZ*5;
}
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
+{
+ unsigned int period_slot;
+ int ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no record hinting faults then either the task is
+ * completely idle or all activity is areas that are not of interest
+ * to automatic numa balancing. Scan slower
+ */
+ if (local + shared == 0) {
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period << 1);
+
+ p->mm->numa_next_scan = jiffies +
+ msecs_to_jiffies(p->numa_scan_period);
+
+ return;
+ }
+
+ /*
+ * Prepare to scale scan period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD scan period stays the same
+ * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+ * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+ */
+ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+ ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ if (ratio >= NUMA_PERIOD_THRESHOLD) {
+ int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+ /*
+ * Scale scan rate increases based on sharing. There is an
+ * inverse relationship between the degree of sharing and
+ * the adjustment made to the scanning period. Broadly
+ * speaking the intent is that there is little point
+ * scanning faster if shared accesses dominate as it may
+ * simply bounce migrations uselessly
+ */
+ period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ }
+
+ p->numa_scan_period = clamp(p->numa_scan_period + diff,
+ task_scan_min(p), task_scan_max(p));
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
unsigned long max_faults = 0, max_group_faults = 0;
+ unsigned long fault_types[2] = { 0, 0 };
spinlock_t *group_lock = NULL;
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p)
/* Decay existing window, copy faults since last scan */
p->numa_faults[i] >>= 1;
p->numa_faults[i] += p->numa_faults_buffer[i];
+ fault_types[priv] += p->numa_faults_buffer[i];
p->numa_faults_buffer[i] = 0;
faults += p->numa_faults[i];
@@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p)
}
}
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
+
if (p->numa_group) {
/*
* If the preferred task and group nids are different,
@@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
BUG_ON(p->numa_faults_buffer);
p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
p->total_numa_faults = 0;
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
/*
@@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
task_numa_group(p, last_cpupid, flags, &priv);
}
- /*
- * If pages are properly placed (did not migrate) then scan slower.
- * This is reset periodically in case of phase changes
- */
- if (!migrated) {
- /* Initialise if necessary */
- if (!p->numa_scan_period_max)
- p->numa_scan_period_max = task_scan_max(p);
-
- p->numa_scan_period = min(p->numa_scan_period_max,
- p->numa_scan_period + 10);
- }
-
task_numa_placement(p);
/* Retry task to preferred node migration if it previously failed */
@@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
p->numa_pages_migrated += pages;
p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+ p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -1702,18 +1776,6 @@ void task_numa_work(struct callback_head *work)
out:
/*
- * If the whole process was scanned without updates then no NUMA
- * hinting faults are being recorded and scan rate should be lower.
- */
- if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
- p->numa_scan_period = min(p->numa_scan_period_max,
- p->numa_scan_period << 1);
-
- next_scan = now + msecs_to_jiffies(p->numa_scan_period);
- mm->numa_next_scan = next_scan;
- }
-
- /*
* It is possible to reach the end of the VMA list but the last few
* VMAs are not guaranteed to the vma_migratable. If they are not, we
* would find the !migratable VMA on the next scan but not reset the
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 7ab4e32..1be2a1f 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1296,8 +1296,10 @@ int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
page_nid = page_to_nid(page);
last_cpupid = page_cpupid_last(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == this_nid)
+ if (page_nid == this_nid) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ flags |= TNF_FAULT_LOCAL;
+ }
/*
* Avoid grouping on DSO/COW pages in specific and RO pages
diff --git a/mm/memory.c b/mm/memory.c
index 823720c..1c7501f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3527,13 +3527,16 @@ static int do_nonlinear_fault(struct mm_struct *mm, struct vm_area_struct *vma,
}
int numa_migrate_prep(struct page *page, struct vm_area_struct *vma,
- unsigned long addr, int page_nid)
+ unsigned long addr, int page_nid,
+ int *flags)
{
get_page(page);
count_vm_numa_event(NUMA_HINT_FAULTS);
- if (page_nid == numa_node_id())
+ if (page_nid == numa_node_id()) {
count_vm_numa_event(NUMA_HINT_FAULTS_LOCAL);
+ *flags |= TNF_FAULT_LOCAL;
+ }
return mpol_misplaced(page, vma, addr);
}
@@ -3593,7 +3596,7 @@ int do_numa_page(struct mm_struct *mm, struct vm_area_struct *vma,
last_cpupid = page_cpupid_last(page);
page_nid = page_to_nid(page);
- target_nid = numa_migrate_prep(page, vma, addr, page_nid);
+ target_nid = numa_migrate_prep(page, vma, addr, page_nid, &flags);
pte_unmap_unlock(ptep, ptl);
if (target_nid == -1) {
put_page(page);
OpenPOWER on IntegriCloud