summaryrefslogtreecommitdiffstats
path: root/kernel/sched/fair.c
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r--kernel/sched/fair.c112
1 files changed, 87 insertions, 25 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index d26a16e..66237ff 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1241,6 +1241,12 @@ static int task_numa_migrate(struct task_struct *p)
sched_setnuma(p, env.dst_nid);
+ /*
+ * Reset the scan period if the task is being rescheduled on an
+ * alternative node to recheck if the tasks is now properly placed.
+ */
+ p->numa_scan_period = task_scan_min(p);
+
if (env.best_task == NULL) {
int ret = migrate_task_to(p, env.best_cpu);
return ret;
@@ -1276,10 +1282,86 @@ static void numa_migrate_preferred(struct task_struct *p)
p->numa_migrate_retry = jiffies + HZ*5;
}
+/*
+ * When adapting the scan rate, the period is divided into NUMA_PERIOD_SLOTS
+ * increments. The more local the fault statistics are, the higher the scan
+ * period will be for the next scan window. If local/remote ratio is below
+ * NUMA_PERIOD_THRESHOLD (where range of ratio is 1..NUMA_PERIOD_SLOTS) the
+ * scan period will decrease
+ */
+#define NUMA_PERIOD_SLOTS 10
+#define NUMA_PERIOD_THRESHOLD 3
+
+/*
+ * Increase the scan period (slow down scanning) if the majority of
+ * our memory is already on our local node, or if the majority of
+ * the page accesses are shared with other processes.
+ * Otherwise, decrease the scan period.
+ */
+static void update_task_scan_period(struct task_struct *p,
+ unsigned long shared, unsigned long private)
+{
+ unsigned int period_slot;
+ int ratio;
+ int diff;
+
+ unsigned long remote = p->numa_faults_locality[0];
+ unsigned long local = p->numa_faults_locality[1];
+
+ /*
+ * If there were no record hinting faults then either the task is
+ * completely idle or all activity is areas that are not of interest
+ * to automatic numa balancing. Scan slower
+ */
+ if (local + shared == 0) {
+ p->numa_scan_period = min(p->numa_scan_period_max,
+ p->numa_scan_period << 1);
+
+ p->mm->numa_next_scan = jiffies +
+ msecs_to_jiffies(p->numa_scan_period);
+
+ return;
+ }
+
+ /*
+ * Prepare to scale scan period relative to the current period.
+ * == NUMA_PERIOD_THRESHOLD scan period stays the same
+ * < NUMA_PERIOD_THRESHOLD scan period decreases (scan faster)
+ * >= NUMA_PERIOD_THRESHOLD scan period increases (scan slower)
+ */
+ period_slot = DIV_ROUND_UP(p->numa_scan_period, NUMA_PERIOD_SLOTS);
+ ratio = (local * NUMA_PERIOD_SLOTS) / (local + remote);
+ if (ratio >= NUMA_PERIOD_THRESHOLD) {
+ int slot = ratio - NUMA_PERIOD_THRESHOLD;
+ if (!slot)
+ slot = 1;
+ diff = slot * period_slot;
+ } else {
+ diff = -(NUMA_PERIOD_THRESHOLD - ratio) * period_slot;
+
+ /*
+ * Scale scan rate increases based on sharing. There is an
+ * inverse relationship between the degree of sharing and
+ * the adjustment made to the scanning period. Broadly
+ * speaking the intent is that there is little point
+ * scanning faster if shared accesses dominate as it may
+ * simply bounce migrations uselessly
+ */
+ period_slot = DIV_ROUND_UP(diff, NUMA_PERIOD_SLOTS);
+ ratio = DIV_ROUND_UP(private * NUMA_PERIOD_SLOTS, (private + shared));
+ diff = (diff * ratio) / NUMA_PERIOD_SLOTS;
+ }
+
+ p->numa_scan_period = clamp(p->numa_scan_period + diff,
+ task_scan_min(p), task_scan_max(p));
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
+}
+
static void task_numa_placement(struct task_struct *p)
{
int seq, nid, max_nid = -1, max_group_nid = -1;
unsigned long max_faults = 0, max_group_faults = 0;
+ unsigned long fault_types[2] = { 0, 0 };
spinlock_t *group_lock = NULL;
seq = ACCESS_ONCE(p->mm->numa_scan_seq);
@@ -1309,6 +1391,7 @@ static void task_numa_placement(struct task_struct *p)
/* Decay existing window, copy faults since last scan */
p->numa_faults[i] >>= 1;
p->numa_faults[i] += p->numa_faults_buffer[i];
+ fault_types[priv] += p->numa_faults_buffer[i];
p->numa_faults_buffer[i] = 0;
faults += p->numa_faults[i];
@@ -1333,6 +1416,8 @@ static void task_numa_placement(struct task_struct *p)
}
}
+ update_task_scan_period(p, fault_types[0], fault_types[1]);
+
if (p->numa_group) {
/*
* If the preferred task and group nids are different,
@@ -1538,6 +1623,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
BUG_ON(p->numa_faults_buffer);
p->numa_faults_buffer = p->numa_faults + (2 * nr_node_ids);
p->total_numa_faults = 0;
+ memset(p->numa_faults_locality, 0, sizeof(p->numa_faults_locality));
}
/*
@@ -1552,19 +1638,6 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
task_numa_group(p, last_cpupid, flags, &priv);
}
- /*
- * If pages are properly placed (did not migrate) then scan slower.
- * This is reset periodically in case of phase changes
- */
- if (!migrated) {
- /* Initialise if necessary */
- if (!p->numa_scan_period_max)
- p->numa_scan_period_max = task_scan_max(p);
-
- p->numa_scan_period = min(p->numa_scan_period_max,
- p->numa_scan_period + 10);
- }
-
task_numa_placement(p);
/* Retry task to preferred node migration if it previously failed */
@@ -1575,6 +1648,7 @@ void task_numa_fault(int last_cpupid, int node, int pages, int flags)
p->numa_pages_migrated += pages;
p->numa_faults_buffer[task_faults_idx(node, priv)] += pages;
+ p->numa_faults_locality[!!(flags & TNF_FAULT_LOCAL)] += pages;
}
static void reset_ptenuma_scan(struct task_struct *p)
@@ -1702,18 +1776,6 @@ void task_numa_work(struct callback_head *work)
out:
/*
- * If the whole process was scanned without updates then no NUMA
- * hinting faults are being recorded and scan rate should be lower.
- */
- if (mm->numa_scan_offset == 0 && !nr_pte_updates) {
- p->numa_scan_period = min(p->numa_scan_period_max,
- p->numa_scan_period << 1);
-
- next_scan = now + msecs_to_jiffies(p->numa_scan_period);
- mm->numa_next_scan = next_scan;
- }
-
- /*
* It is possible to reach the end of the VMA list but the last few
* VMAs are not guaranteed to the vma_migratable. If they are not, we
* would find the !migratable VMA on the next scan but not reset the
OpenPOWER on IntegriCloud