From a54a407fbf7735fd8f7841375574f5d9b0375f93 Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Mon, 7 Oct 2013 11:28:46 +0100
Subject: mm: Close races between THP migration and PMD numa clearing

THP migration uses the page lock to guard against parallel allocations
but there are cases like the following that are still open:

  Task A                            Task B
  ---------------------             ---------------------
  do_huge_pmd_numa_page             do_huge_pmd_numa_page
  lock_page
  mpol_misplaced == -1
  unlock_page
  goto clear_pmdnuma
                                    lock_page
                                    mpol_misplaced == 2
                                    migrate_misplaced_transhuge
  pmd = pmd_mknonnuma
  set_pmd_at

During hours of testing, one test machine crashed with weird errors
and, while I have no direct evidence, I suspect something like the race
above happened. This patch extends the page lock so that it is held
until the pmd_numa bit is cleared, which prevents a migration from
starting in parallel while the bit is being cleared. It also flushes
the old pmd entry and orders the pagetable insertion before the rmap
insertion.

Signed-off-by: Mel Gorman
Reviewed-by: Rik van Riel
Cc: Andrea Arcangeli
Cc: Johannes Weiner
Cc: Srikar Dronamraju
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1381141781-10992-9-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar
---
 mm/migrate.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'mm/migrate.c')

diff --git a/mm/migrate.c b/mm/migrate.c
index a26bccd..7bd90d3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1713,12 +1713,12 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 		unlock_page(new_page);
 		put_page(new_page);		/* Free it */

-		unlock_page(page);
+		/* Retake the callers reference and putback on LRU */
+		get_page(page);
 		putback_lru_page(page);
-
-		count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-		isolated = 0;
-		goto out;
+		mod_zone_page_state(page_zone(page),
+			 NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);
+		goto out_fail;
 	}

 	/*
@@ -1735,9 +1735,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
 	entry = pmd_mkhuge(entry);

-	page_add_new_anon_rmap(new_page, vma, haddr);
-
+	pmdp_clear_flush(vma, haddr, pmd);
 	set_pmd_at(mm, haddr, pmd, entry);
+	page_add_new_anon_rmap(new_page, vma, haddr);
 	update_mmu_cache_pmd(vma, address, &entry);
 	page_remove_rmap(page);
 	/*
@@ -1756,7 +1756,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	count_vm_events(PGMIGRATE_SUCCESS, HPAGE_PMD_NR);
 	count_vm_numa_events(NUMA_PAGE_MIGRATE, HPAGE_PMD_NR);

-out:
 	mod_zone_page_state(page_zone(page),
 			NR_ISOLATED_ANON + page_lru, -HPAGE_PMD_NR);

@@ -1765,6 +1764,10 @@ out:
 out_fail:
 	count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
 out_dropref:
+	entry = pmd_mknonnuma(entry);
+	set_pmd_at(mm, haddr, pmd, entry);
+	update_mmu_cache_pmd(vma, address, &entry);
+
 	unlock_page(page);
 	put_page(page);
 	return 0;
--
cgit v1.1
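To make the ordering above easier to follow, here is a condensed sketch
(not the exact kernel code; the variable names simply mirror those in
migrate_misplaced_transhuge_page() above) of the sequence the patch
establishes when switching the huge PMD from the old page to new_page:

    /*
     * Sketch only: flush the stale NUMA pmd first, make the new huge
     * pmd visible in the page table, and only then add the rmap for
     * the new page and drop the rmap of the old one.
     */
    pmdp_clear_flush(vma, haddr, pmd);            /* flush old pmd entry  */
    set_pmd_at(mm, haddr, pmd, entry);            /* pagetable insertion  */
    page_add_new_anon_rmap(new_page, vma, haddr); /* rmap insertion       */
    update_mmu_cache_pmd(vma, address, &entry);
    page_remove_rmap(page);                       /* unmap the old page   */

On the failure paths the patch restores a usable, non-NUMA pmd before
the page is unlocked and released, so the faulting access can still
complete when the migration is aborted.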
From 1bc115d87dffd1c43bdc3c9c9d1e3a51c195d18e Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Mon, 7 Oct 2013 11:29:05 +0100
Subject: mm: numa: Scan pages with elevated page_mapcount

Currently automatic NUMA balancing is unable to distinguish between
false sharing and genuinely private pages except by ignoring pages with
an elevated page_mapcount entirely. This avoids shared pages bouncing
between the nodes whose tasks are using them, but it also ignores quite
a lot of data. This patch kicks away the training wheels in preparation
for adding support for identifying shared/private pages. The ordering
is chosen so that the impact of the shared/private detection can be
easily measured.

Note that the patch does not migrate shared, file-backed pages within
vmas marked VM_EXEC as these are generally shared library pages.
Migrating such pages is not beneficial as there is an expectation that
they are read-shared between caches and that iTLB and iCache pressure
is generally low.

Signed-off-by: Mel Gorman
Reviewed-by: Rik van Riel
Cc: Andrea Arcangeli
Cc: Johannes Weiner
Cc: Srikar Dronamraju
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1381141781-10992-28-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar
---
 mm/migrate.c | 17 ++++++-----------
 1 file changed, 6 insertions(+), 11 deletions(-)

(limited to 'mm/migrate.c')

diff --git a/mm/migrate.c b/mm/migrate.c
index 7bd90d3..fcba2f4 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1599,7 +1599,8 @@ int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
  * node. Caller is expected to have an elevated reference count on
  * the page that will be dropped by this function before returning.
  */
-int migrate_misplaced_page(struct page *page, int node)
+int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
+			   int node)
 {
 	pg_data_t *pgdat = NODE_DATA(node);
 	int isolated;
@@ -1607,10 +1608,11 @@ int migrate_misplaced_page(struct page *page, int node)
 	LIST_HEAD(migratepages);

 	/*
-	 * Don't migrate pages that are mapped in multiple processes.
-	 * TODO: Handle false sharing detection instead of this hammer
+	 * Don't migrate file pages that are mapped in multiple processes
+	 * with execute permissions as they are probably shared libraries.
 	 */
-	if (page_mapcount(page) != 1)
+	if (page_mapcount(page) != 1 && page_is_file_cache(page) &&
+	    (vma->vm_flags & VM_EXEC))
 		goto out;

 	/*
@@ -1661,13 +1663,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	int page_lru = page_is_file_cache(page);

 	/*
-	 * Don't migrate pages that are mapped in multiple processes.
-	 * TODO: Handle false sharing detection instead of this hammer
-	 */
-	if (page_mapcount(page) != 1)
-		goto out_dropref;
-
-	/*
 	 * Rate-limit the amount of data that is being migrated to a node.
 	 * Optimal placement is no good if the memory bus is saturated and
 	 * all the time is being spent migrating!
--
cgit v1.1
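The check that remains in migrate_misplaced_page() can be read as a
predicate. The helper below is purely illustrative (it is not part of
the patch) and simply restates the condition under which a misplaced
page is now left alone:

    /*
     * Illustrative only: deny NUMA migration when the page is mapped
     * by several processes, is file backed and sits in an executable
     * mapping -- almost certainly a shared library.
     */
    static bool numa_migrate_deny(struct page *page,
                                  struct vm_area_struct *vma)
    {
            return page_mapcount(page) != 1 &&
                   page_is_file_cache(page) &&
                   (vma->vm_flags & VM_EXEC);
    }

Every other page with an elevated mapcount is now scanned and may be
migrated, which is what exposes the shared/private distinction that the
following patches build on.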
From b795854b1fa70f6aee923ae5df74ff7afeaddcaa Mon Sep 17 00:00:00 2001
From: Mel Gorman
Date: Mon, 7 Oct 2013 11:29:07 +0100
Subject: sched/numa: Set preferred NUMA node based on number of private faults

Ideally it would be possible to distinguish between NUMA hinting faults
that are private to a task and those that are shared. If treated
identically there is a risk that shared pages bounce between nodes
depending on the order in which they are referenced by tasks.
Ultimately what is desirable is that task-private pages remain local to
the task while shared pages are interleaved between sharing tasks
running on different nodes to give good average performance. This is
further complicated by THP as even applications that partition their
data may not be partitioning on a huge page boundary.

To start with, this patch assumes that multi-threaded or multi-process
applications partition their data and that private accesses are
generally more important for cpu->memory locality. Also, no new
infrastructure is required to treat private pages properly, but
interleaving for shared pages requires additional infrastructure.

To detect private accesses the pid of the last accessing task is
required, but the storage requirements are high. This patch borrows
heavily from Ingo Molnar's patch "numa, mm, sched: Implement
last-CPU+PID hash tracking" to encode some bits from the last accessing
task in the page flags alongside the node information. Collisions will
occur, but this is still better than depending on the node information
alone. The node information is then used to determine whether a page
needs to migrate; the PID information is used to detect private/shared
accesses.

The preferred NUMA node is selected based on where the maximum number
of approximately private faults was measured. Shared faults are not
taken into consideration for a few reasons. First, if there are many
tasks sharing the page then they will all move towards the same node.
The node will become compute-overloaded and tasks will be scheduled
away from it later, only to bounce back again. Alternatively, the
sharing tasks would just bounce between nodes because the fault
information is effectively noise. Either way, accounting for shared
faults the same as private faults can result in lower overall
performance.

The second reason is based on a hypothetical workload that has a small
number of very important, heavily accessed private pages but a large
shared array. The shared array would dominate the number of faults and
its node would be selected as the preferred node even though that is
the wrong decision. The third reason is that multiple threads in a
process will race each other to fault the shared page, making the fault
information unreliable.

Signed-off-by: Mel Gorman
[ Fix compilation error when !NUMA_BALANCING. ]
Reviewed-by: Rik van Riel
Cc: Andrea Arcangeli
Cc: Johannes Weiner
Cc: Srikar Dronamraju
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1381141781-10992-30-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar
---
 mm/migrate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/migrate.c')

diff --git a/mm/migrate.c b/mm/migrate.c
index fcba2f4..025d1e3 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1498,7 +1498,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 					  __GFP_NOWARN) &
 					 ~GFP_IOFS, 0);
 	if (newpage)
-		page_nid_xchg_last(newpage, page_nid_last(page));
+		page_nidpid_xchg_last(newpage, page_nidpid_last(page));

 	return newpage;
 }
@@ -1675,7 +1675,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	if (!new_page)
 		goto out_fail;

-	page_nid_xchg_last(new_page, page_nid_last(page));
+	page_nidpid_xchg_last(new_page, page_nidpid_last(page));

 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated) {
--
cgit v1.1
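As a rough illustration of the encoding described above (a toy sketch
with made-up field widths; the kernel derives the real widths from
NODES_SHIFT and the space left in page->flags), the last node and a few
bits of the last faulting pid can be packed and compared like this:

    /* Toy sketch only -- not the kernel's actual page-flags layout. */
    #define SKETCH_PID_BITS		8
    #define SKETCH_PID_MASK		((1 << SKETCH_PID_BITS) - 1)

    static inline int nidpid_pack(int nid, int pid)
    {
            return (nid << SKETCH_PID_BITS) | (pid & SKETCH_PID_MASK);
    }

    static inline int nidpid_to_nid(int nidpid)
    {
            return nidpid >> SKETCH_PID_BITS;
    }

    /*
     * A fault looks "private" when the stored pid bits match the
     * current task; collisions are possible but tolerable.
     */
    static inline bool nidpid_fault_is_private(int nidpid, int pid)
    {
            return (nidpid & SKETCH_PID_MASK) == (pid & SKETCH_PID_MASK);
    }

The node half decides whether the page is misplaced and should migrate;
the pid half feeds the private/shared fault statistics used to pick the
preferred node.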
From 90572890d202527c366aa9489b32404e88a7c020 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra
Date: Mon, 7 Oct 2013 11:29:20 +0100
Subject: mm: numa: Change page last {nid,pid} into {cpu,pid}

Change the per page last fault tracking to use cpu,pid instead of
nid,pid. This will allow us to try and look up the alternate task more
easily. Note that even though it is the cpu that is stored in the page
flags, the mpol_misplaced decision is still based on the node.

Signed-off-by: Peter Zijlstra
Signed-off-by: Mel Gorman
Reviewed-by: Rik van Riel
Cc: Andrea Arcangeli
Cc: Johannes Weiner
Cc: Srikar Dronamraju
Link: http://lkml.kernel.org/r/1381141781-10992-43-git-send-email-mgorman@suse.de
[ Fixed build failure on 32-bit systems. ]
Signed-off-by: Ingo Molnar
---
 mm/migrate.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'mm/migrate.c')

diff --git a/mm/migrate.c b/mm/migrate.c
index 025d1e3..ff53774 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1498,7 +1498,7 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
 					  __GFP_NOWARN) &
 					 ~GFP_IOFS, 0);
 	if (newpage)
-		page_nidpid_xchg_last(newpage, page_nidpid_last(page));
+		page_cpupid_xchg_last(newpage, page_cpupid_last(page));

 	return newpage;
 }
@@ -1675,7 +1675,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 	if (!new_page)
 		goto out_fail;

-	page_nidpid_xchg_last(new_page, page_nidpid_last(page));
+	page_cpupid_xchg_last(new_page, page_cpupid_last(page));

 	isolated = numamigrate_isolate_page(pgdat, page);
 	if (!isolated) {
--
cgit v1.1

From 7851a45cd3f6198bf542c30e27b330e8eeb3736c Mon Sep 17 00:00:00 2001
From: Rik van Riel
Date: Mon, 7 Oct 2013 11:29:23 +0100
Subject: mm: numa: Copy cpupid on page migration

After page migration, the new page has the cpupid unset. This makes
every fault on a recently migrated page look like a first numa fault,
leading to another page migration. Copying over the cpupid at page
migration time should prevent erroneous migrations of recently migrated
pages.

Signed-off-by: Rik van Riel
Signed-off-by: Mel Gorman
Cc: Andrea Arcangeli
Cc: Johannes Weiner
Cc: Srikar Dronamraju
Signed-off-by: Peter Zijlstra
Link: http://lkml.kernel.org/r/1381141781-10992-46-git-send-email-mgorman@suse.de
Signed-off-by: Ingo Molnar
---
 mm/migrate.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'mm/migrate.c')

diff --git a/mm/migrate.c b/mm/migrate.c
index ff53774..44c1fa9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -443,6 +443,8 @@ int migrate_huge_page_move_mapping(struct address_space *mapping,
  */
 void migrate_page_copy(struct page *newpage, struct page *page)
 {
+	int cpupid;
+
 	if (PageHuge(page) || PageTransHuge(page))
 		copy_huge_page(newpage, page);
 	else
@@ -479,6 +481,13 @@ void migrate_page_copy(struct page *newpage, struct page *page)
 		__set_page_dirty_nobuffers(newpage);
 	}

+	/*
+	 * Copy NUMA information to the new page, to prevent over-eager
+	 * future migrations of this same page.
+	 */
+	cpupid = page_cpupid_xchg_last(page, -1);
+	page_cpupid_xchg_last(newpage, cpupid);
+
 	mlock_migrate_page(newpage, page);
 	ksm_migrate_page(newpage, page);
 	/*
--
cgit v1.1
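Condensed into a standalone helper (a hypothetical name used only for
illustration; in the patch the two calls sit directly in
migrate_page_copy()), the copy amounts to:

    /*
     * Sketch: carry the last cpu|pid over to the destination page so
     * the next NUMA hinting fault on it does not look like a first
     * fault and immediately trigger another migration.
     */
    static void copy_numa_fault_state(struct page *newpage,
                                      struct page *page)
    {
            int cpupid;

            cpupid = page_cpupid_xchg_last(page, -1);  /* read and clear old */
            page_cpupid_xchg_last(newpage, cpupid);    /* stamp onto new     */
    }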