From ec65993443736a5091b68e80ff1734548944a4b8 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:16 -0800 Subject: mm, x86: Account for TLB flushes only when debugging Bisection between 3.11 and 3.12 fingered commit 9824cf97 ("mm: vmstats: tlb flush counters") to cause overhead problems. The counters are undeniably useful but how often do we really need to debug TLB flush related issues? It does not justify taking the penalty everywhere so make it a debugging option. Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Hugh Dickins Cc: Alex Shi Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-XzxjntugxuwpxXhcrxqqh53b@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ae699b3..05446c1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -103,7 +103,7 @@ static void flush_tlb_func(void *info) if (f->flush_mm != this_cpu_read(cpu_tlbstate.active_mm)) return; - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) { if (f->flush_end == TLB_FLUSH_ALL) local_flush_tlb(); @@ -131,7 +131,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, info.flush_start = start; info.flush_end = end; - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); if (is_uv_system()) { unsigned int cpu; @@ -151,7 +151,7 @@ void flush_tlb_current_task(void) preempt_disable(); - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); if (cpumask_any_but(mm_cpumask(mm), smp_processor_id()) < nr_cpu_ids) flush_tlb_others(mm_cpumask(mm), mm, 0UL, TLB_FLUSH_ALL); @@ -215,7 +215,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, /* 
tlb_flushall_shift is on balance point, details in commit log */ if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ALL); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { if (has_large_page(mm, start, end)) { @@ -224,7 +224,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, } /* flush range by one by one 'invlpg' */ for (addr = start; addr < end; addr += PAGE_SIZE) { - count_vm_event(NR_TLB_LOCAL_FLUSH_ONE); + count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); } @@ -262,7 +262,7 @@ void flush_tlb_page(struct vm_area_struct *vma, unsigned long start) static void do_flush_tlb_all(void *info) { - count_vm_event(NR_TLB_REMOTE_FLUSH_RECEIVED); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); __flush_tlb_all(); if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_LAZY) leave_mm(smp_processor_id()); @@ -270,7 +270,7 @@ static void do_flush_tlb_all(void *info) void flush_tlb_all(void) { - count_vm_event(NR_TLB_REMOTE_FLUSH); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); on_each_cpu(do_flush_tlb_all, NULL, 1); } -- cgit v1.1 From 15aa368255f249df0b2af630c9487bb5471bd7da Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:18 -0800 Subject: x86/mm: Clean up inconsistencies when flushing TLB ranges NR_TLB_LOCAL_FLUSH_ALL is not always accounted for correctly and the comparison with total_vm is done before taking tlb_flushall_shift into account. Clean it up. 
Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Alex Shi Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Hugh Dickins Link: http://lkml.kernel.org/n/tip-Iz5gcahrgskIldvukulzi0hh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 05446c1..5176526 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -189,6 +189,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, { unsigned long addr; unsigned act_entries, tlb_entries = 0; + unsigned long nr_base_pages; preempt_disable(); if (current->active_mm != mm) @@ -210,18 +211,17 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, tlb_entries = tlb_lli_4k[ENTRIES]; else tlb_entries = tlb_lld_4k[ENTRIES]; + /* Assume all of TLB entries was occupied by this task */ - act_entries = mm->total_vm > tlb_entries ? tlb_entries : mm->total_vm; + act_entries = tlb_entries >> tlb_flushall_shift; + act_entries = mm->total_vm > act_entries ? 
act_entries : mm->total_vm; + nr_base_pages = (end - start) >> PAGE_SHIFT; /* tlb_flushall_shift is on balance point, details in commit log */ - if ((end - start) >> PAGE_SHIFT > act_entries >> tlb_flushall_shift) { + if (nr_base_pages > act_entries || has_large_page(mm, start, end)) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { - if (has_large_page(mm, start, end)) { - local_flush_tlb(); - goto flush_all; - } /* flush range by one by one 'invlpg' */ for (addr = start; addr < end; addr += PAGE_SIZE) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); -- cgit v1.1 From 71b54f8263860a37dd9f50f81880a9d681fd9c10 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 21 Jan 2014 14:33:19 -0800 Subject: x86/mm: Eliminate redundant page table walk during TLB range flushing When choosing between doing an address space or ranged flush, the x86 implementation of flush_tlb_mm_range takes into account whether there are any large pages in the range. A per-page flush typically requires fewer entries than would be covered by a single large page and the check is redundant. There is one potential exception. THP migration flushes single THP entries and it conceivably would benefit from flushing a single entry instead of the mm. However, this flush is after a THP allocation, copy and page table update potentially with any other threads serialised behind it. In comparison to that, the flush is noise. It makes more sense to optimise balancing to require fewer flushes than to optimise the flush itself. This patch deletes the redundant huge page check. 
Signed-off-by: Mel Gorman Tested-by: Davidlohr Bueso Reviewed-by: Rik van Riel Signed-off-by: Andrew Morton Cc: Alex Shi Cc: Linus Torvalds Cc: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-sgei1drpOcburujPsfh6ovmo@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 28 +--------------------------- 1 file changed, 1 insertion(+), 27 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 5176526..dd8dda1 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -158,32 +158,6 @@ void flush_tlb_current_task(void) preempt_enable(); } -/* - * It can find out the THP large page, or - * HUGETLB page in tlb_flush when THP disabled - */ -static inline unsigned long has_large_page(struct mm_struct *mm, - unsigned long start, unsigned long end) -{ - pgd_t *pgd; - pud_t *pud; - pmd_t *pmd; - unsigned long addr = ALIGN(start, HPAGE_SIZE); - for (; addr < end; addr += HPAGE_SIZE) { - pgd = pgd_offset(mm, addr); - if (likely(!pgd_none(*pgd))) { - pud = pud_offset(pgd, addr); - if (likely(!pud_none(*pud))) { - pmd = pmd_offset(pud, addr); - if (likely(!pmd_none(*pmd))) - if (pmd_large(*pmd)) - return addr; - } - } - } - return 0; -} - void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned long vmflag) { @@ -218,7 +192,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, nr_base_pages = (end - start) >> PAGE_SHIFT; /* tlb_flushall_shift is on balance point, details in commit log */ - if (nr_base_pages > act_entries || has_large_page(mm, start, end)) { + if (nr_base_pages > act_entries) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ALL); local_flush_tlb(); } else { -- cgit v1.1 From a85eba8814631d0d48361c8b9a7ee0984e80c03c Mon Sep 17 00:00:00 2001 From: Toshi Kani Date: Tue, 21 Jan 2014 14:33:15 -0800 Subject: arch/x86/mm/srat: Skip NUMA_NO_NODE while parsing SLIT When ACPI SLIT table has an I/O locality (i.e. 
a locality unique to an I/O device), numa_set_distance() emits this warning message: NUMA: Warning: node ids are out of bound, from=-1 to=-1 distance=10 acpi_numa_slit_init() calls numa_set_distance() with pxm_to_node(), which assumes that all localities have been parsed with SRAT previously. SRAT does not list I/O localities, whereas SLIT lists all localities including I/Os. Hence, pxm_to_node() returns NUMA_NO_NODE (-1) for an I/O locality. I/O localities are not supported and are ignored today, but emitting such a warning message leads to unnecessary confusion. Change acpi_numa_slit_init() to avoid calling numa_set_distance() with NUMA_NO_NODE. Signed-off-by: Toshi Kani Acked-by: David Rientjes Signed-off-by: Andrew Morton Cc: Yinghai Lu Link: http://lkml.kernel.org/n/tip-dSvpjjvp8aMzs1ybkftxohlh@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/mm/srat.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 266ca91..5ecf651 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -42,15 +42,25 @@ static __init inline int srat_disabled(void) return acpi_numa < 0; } -/* Callback for SLIT parsing */ +/* + * Callback for SLIT parsing. pxm_to_node() returns NUMA_NO_NODE for + * I/O localities since SRAT does not list them. I/O localities are + * not supported at this point. 
+ */ void __init acpi_numa_slit_init(struct acpi_table_slit *slit) { int i, j; - for (i = 0; i < slit->locality_count; i++) - for (j = 0; j < slit->locality_count; j++) + for (i = 0; i < slit->locality_count; i++) { + if (pxm_to_node(i) == NUMA_NO_NODE) + continue; + for (j = 0; j < slit->locality_count; j++) { + if (pxm_to_node(j) == NUMA_NO_NODE) + continue; numa_set_distance(pxm_to_node(i), pxm_to_node(j), slit->entry[slit->locality_count * i + j]); + } + } } /* Callback for Proximity Domain -> x2APIC mapping */ -- cgit v1.1 From 85fc73a2cdf10cf42bc36fb3bca3896b2095a1c2 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Sat, 1 Feb 2014 13:30:19 +0100 Subject: x86: Fix the initialization of physnode_map With DISCONTIGMEM, the mapping between a pfn and its owning node is initialized using data provided by the BIOS. However, the initialization may fail if the extents are not aligned to a section boundary (64M). The symptom of this bug is an early boot failure in pfn_to_page(), as it tries to access NODE_DATA(__nid) using an index from an uninitialized element of the physnode_map[] array. While the bug is always present, it is more likely to be hit in kdump kernels on large machines, because: 1. The memory map for a kdump kernel is specified as exactmap, and exactmap is more likely to be unaligned. 2. Large reservations are more likely to span across a 64M boundary. [ hpa: fixed incorrect use of "pfn" instead of "start" ] Signed-off-by: Petr Tesarik Link: http://lkml.kernel.org/r/20140201133019.32e56f86@hananiah.suse.cz Acked-by: David Rientjes Signed-off-by: H. 
Peter Anvin --- arch/x86/mm/numa_32.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa_32.c b/arch/x86/mm/numa_32.c index 0342d27..47b6436 100644 --- a/arch/x86/mm/numa_32.c +++ b/arch/x86/mm/numa_32.c @@ -52,6 +52,8 @@ void memory_present(int nid, unsigned long start, unsigned long end) nid, start, end); printk(KERN_DEBUG " Setting physnode_map array to node %d for pfns:\n", nid); printk(KERN_DEBUG " "); + start = round_down(start, PAGES_PER_SECTION); + end = round_up(end, PAGES_PER_SECTION); for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) { physnode_map[pfn / PAGES_PER_SECTION] = nid; printk(KERN_CONT "%lx ", pfn); -- cgit v1.1 From 017c217a26e9bf6948482f751b30d0507e30a7d0 Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Thu, 6 Feb 2014 12:04:25 -0800 Subject: arch/x86/mm/numa.c: initialize numa_kernel_nodes in numa_clear_kernel_node_hotplug() On-stack variable numa_kernel_nodes in numa_clear_kernel_node_hotplug() was not initialized. So we need to initialize it. [akpm@linux-foundation.org: use NODE_MASK_NONE, per David] Signed-off-by: Tang Chen Tested-by: Gu Zheng Reported-by: Dave Jones Reported-by: David Rientjes Tested-by: Dave Jones Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 81b2750..45ec9d7 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -565,7 +565,7 @@ static void __init numa_init_array(void) static void __init numa_clear_kernel_node_hotplug(void) { int i, nid; - nodemask_t numa_kernel_nodes; + nodemask_t numa_kernel_nodes = NODE_MASK_NONE; unsigned long start, end; struct memblock_type *type = &memblock.reserved; -- cgit v1.1 From 7bc35fdde6724549a0239b71e08b9f33d8bf2bfb Mon Sep 17 00:00:00 2001 From: Tang Chen Date: Thu, 6 Feb 2014 12:04:27 -0800 Subject: arch/x86/mm/numa.c: fix array index overflow when synchronizing nid to memblock.reserved. The following path will cause an out-of-bounds array access. memblock_add_region() will always set nid in memblock.reserved to MAX_NUMNODES. In numa_register_memblks(), after we set all nid to correct values in memblock.reserved, we called setup_node_data(), and used memblock_alloc_nid() to allocate memory, with nid set to MAX_NUMNODES. The nodemask_t type can be seen as a bit array. And the index is 0 ~ MAX_NUMNODES-1. After that, when we call node_set() in numa_clear_kernel_node_hotplug(), the nodemask_t got an index of value MAX_NUMNODES, which is out of [0 ~ MAX_NUMNODES-1]. See below: numa_init() |---> numa_register_memblks() | |---> memblock_set_node(memory) set correct nid in memblock.memory | |---> memblock_set_node(reserved) set correct nid in memblock.reserved | |...... | |---> setup_node_data() | |---> memblock_alloc_nid() here, nid is set to MAX_NUMNODES (1024) |...... |---> numa_clear_kernel_node_hotplug() |---> node_set() here, we have an index 1024, and overflowed This patch moves nid setting to numa_clear_kernel_node_hotplug() to fix this problem. 
Reported-by: Dave Jones Signed-off-by: Tang Chen Tested-by: Gu Zheng Reported-by: Dave Jones Cc: David Rientjes Tested-by: Dave Jones Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 45ec9d7..27aa0455 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi) struct numa_memblk *mb = &mi->blk[i]; memblock_set_node(mb->start, mb->end - mb->start, &memblock.memory, mb->nid); - - /* - * At this time, all memory regions reserved by memblock are - * used by the kernel. Set the nid in memblock.reserved will - * mark out all the nodes the kernel resides in. - */ - memblock_set_node(mb->start, mb->end - mb->start, - &memblock.reserved, mb->nid); } /* @@ -569,6 +561,17 @@ static void __init numa_clear_kernel_node_hotplug(void) unsigned long start, end; struct memblock_type *type = &memblock.reserved; + /* + * At this time, all memory regions reserved by memblock are + * used by the kernel. Set the nid in memblock.reserved will + * mark out all the nodes the kernel resides in. + */ + for (i = 0; i < numa_meminfo.nr_blks; i++) { + struct numa_memblk *mb = &numa_meminfo.blk[i]; + memblock_set_node(mb->start, mb->end - mb->start, + &memblock.reserved, mb->nid); + } + /* Mark all kernel nodes. */ for (i = 0; i < type->cnt; i++) node_set(type->regions[i].nid, numa_kernel_nodes); -- cgit v1.1 From 4640c7ee9b8953237d05a61ea3ea93981d1bc961 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 13 Feb 2014 07:46:04 -0800 Subject: x86, smap: smap_violation() is bogus if CONFIG_X86_SMAP is off If CONFIG_X86_SMAP is disabled, smap_violation() tests for conditions which are incorrect (as the AC flag doesn't matter), causing spurious faults. 
The dynamic disabling of SMAP (nosmap on the command line) is fine because it disables X86_FEATURE_SMAP, therefore causing the static_cpu_has() to return false. Found by Fengguang Wu's test system. [ v3: move all predicates into smap_violation() ] [ v2: use IS_ENABLED() instead of #ifdef ] Reported-by: Fengguang Wu Link: http://lkml.kernel.org/r/20140213124550.GA30497@localhost Signed-off-by: H. Peter Anvin Cc: # v3.7+ --- arch/x86/mm/fault.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 9d591c8..6dea040 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1001,6 +1001,12 @@ static int fault_in_kernel_space(unsigned long address) static inline bool smap_violation(int error_code, struct pt_regs *regs) { + if (!IS_ENABLED(CONFIG_X86_SMAP)) + return false; + + if (!static_cpu_has(X86_FEATURE_SMAP)) + return false; + if (error_code & PF_USER) return false; @@ -1087,11 +1093,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - if (static_cpu_has(X86_FEATURE_SMAP)) { - if (unlikely(smap_violation(error_code, regs))) { - bad_area_nosemaphore(regs, error_code, address); - return; - } + if (unlikely(smap_violation(error_code, regs))) { + bad_area_nosemaphore(regs, error_code, address); + return; } /* -- cgit v1.1