From 801a55911432f582c8ab82c895d2821dc02b70e3 Mon Sep 17 00:00:00 2001 From: Pavel Machek Date: Fri, 2 Jan 2015 06:11:16 +0100 Subject: x86: init_mem_mapping(): use capital BIOS in comment Use capital BIOS in comment. Its cleaner, and allows diference between BIOS and BIOs. Signed-off-by: Pavel Machek Signed-off-by: Jiri Kosina --- arch/x86/mm/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 66dba36..452f904 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -582,7 +582,7 @@ void __init init_mem_mapping(void) * * * On x86, access has to be given to the first megabyte of ram because that area - * contains bios code and data regions used by X and dosemu and similar apps. + * contains BIOS code and data regions used by X and dosemu and similar apps. * Access has to be given to non-kernel-ram areas as well, these contain the PCI * mmio resources as well as potential bios/acpi data regions. */ -- cgit v1.1 From 375074cc736ab1d89a708c0a8d7baa4a70d5d476 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:07 -0700 Subject: x86: Clean up cr4 manipulation CR4 manipulation was split, seemingly at random, between direct (write_cr4) and using a helper (set/clear_in_cr4). Unfortunately, the set_in_cr4 and clear_in_cr4 helpers also poke at the boot code, which only a small subset of users actually wanted. This patch replaces all cr4 access in functions that don't leave cr4 exactly the way they found it with new helpers cr4_set_bits, cr4_clear_bits, and cr4_set_bits_and_update_boot. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Kees Cook Cc: Linus Torvalds Link: http://lkml.kernel.org/r/495a10bdc9e67016b8fd3945700d46cfd5c12c2f.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 079c3b6..d4eddbd 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -173,11 +173,11 @@ static void __init probe_page_size_mask(void) /* Enable PSE if available */ if (cpu_has_pse) - set_in_cr4(X86_CR4_PSE); + cr4_set_bits_and_update_boot(X86_CR4_PSE); /* Enable PGE if available */ if (cpu_has_pge) { - set_in_cr4(X86_CR4_PGE); + cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } } -- cgit v1.1 From 1e02ce4cccdcb9688386e5b8d2c9fa4660b45389 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 24 Oct 2014 15:58:08 -0700 Subject: x86: Store a per-cpu shadow copy of CR4 Context switches and TLB flushes can change individual bits of CR4. CR4 reads take several cycles, so store a shadow copy of CR4 in a per-cpu variable. To avoid wasting a cache line, I added the CR4 shadow to cpu_tlbstate, which is already touched in switch_mm. The heaviest users of the cr4 shadow will be switch_mm and __switch_to_xtra, and __switch_to_xtra is called shortly after switch_mm during context switch, so the cacheline is likely to be hot. Signed-off-by: Andy Lutomirski Reviewed-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Kees Cook Cc: Andrea Arcangeli Cc: Vince Weaver Cc: "hillf.zj" Cc: Valdis Kletnieks Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Link: http://lkml.kernel.org/r/3a54dd3353fffbf84804398e00dfdc5b7c1afd7d.1414190806.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 2 +- arch/x86/mm/init.c | 9 +++++++++ arch/x86/mm/tlb.c | 3 --- 3 files changed, 10 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index e3ff27a..ede025f 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -600,7 +600,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code, printk(nx_warning, from_kuid(&init_user_ns, current_uid())); if (pte && pte_present(*pte) && pte_exec(*pte) && (pgd_flags(*pgd) & _PAGE_USER) && - (read_cr4() & X86_CR4_SMEP)) + (__read_cr4() & X86_CR4_SMEP)) printk(smep_warning, from_kuid(&init_user_ns, current_uid())); } diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index d4eddbd..a74aa0f 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -713,6 +713,15 @@ void __init zone_sizes_init(void) free_area_init_nodes(max_zone_pfns); } +DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = { +#ifdef CONFIG_SMP + .active_mm = &init_mm, + .state = 0, +#endif + .cr4 = ~0UL, /* fail hard if we screw up cr4 shadow initialization */ +}; +EXPORT_SYMBOL_GPL(cpu_tlbstate); + void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache) { /* entry 0 MUST be WB (hardwired to speed up translations) */ diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index ee61c36..3250f23 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -14,9 +14,6 @@ #include #include -DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) - = { &init_mm, 0, }; - /* * Smarter SMP flushing macros. * c/o Linus Torvalds. -- cgit v1.1 From ece84b390ab0ceada9c771749455f3594c36e3df Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Tue, 10 Feb 2015 14:08:19 -0800 Subject: hugetlb, x86: register 1G page size if we can allocate them at runtime After commit 944d9fec8d7a ("hugetlb: add support for gigantic page allocation at runtime") we can allocate 1G pages at runtime if CMA is enabled. Let's register 1G pages into hugetlb even if the user hasn't requested them explicitly at boot time with hugepagesz=1G. Signed-off-by: Kirill A. Shutemov Reviewed-by: Luiz Capitulino Cc: Naoya Horiguchi Cc: Andi Kleen Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index 8b977eb..bca0aa3 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -178,4 +178,15 @@ static __init int setup_hugepagesz(char *opt) return 1; } __setup("hugepagesz=", setup_hugepagesz); + +#ifdef CONFIG_CMA +static __init int gigantic_pages_init(void) +{ + /* With CMA we can allocate gigantic pages at runtime */ + if (cpu_has_gbpages && !size_to_hstate(1UL << PUD_SHIFT)) + hugetlb_add_hstate(PUD_SHIFT - PAGE_SHIFT); + return 0; +} +arch_initcall(gigantic_pages_init); +#endif #endif -- cgit v1.1 From 61f77eda9bbf0d2e922197ed2dcf88638a639ce5 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:25:15 -0800 Subject: mm/hugetlb: reduce arch dependent code around follow_huge_* Currently we have many duplicates in definitions around follow_huge_addr(), follow_huge_pmd(), and follow_huge_pud(), so this patch tries to remove the m. The basic idea is to put the default implementation for these functions in mm/hugetlb.c as weak symbols (regardless of CONFIG_ARCH_WANT_GENERAL_HUGETL B), and to implement arch-specific code only when the arch needs it. For follow_huge_addr(), only powerpc and ia64 have their own implementation, and in all other architectures this function just returns ERR_PTR(-EINVAL). So this patch sets returning ERR_PTR(-EINVAL) as default. As for follow_huge_(pmd|pud)(), if (pmd|pud)_huge() is implemented to always return 0 in your architecture (like in ia64 or sparc,) it's never called (the callsite is optimized away) no matter how implemented it is. So in such architectures, we don't need arch-specific implementation. In some architecture (like mips, s390 and tile,) their current arch-specific follow_huge_(pmd|pud)() are effectively identical with the common code, so this patch lets these architecture use the common code. One exception is metag, where pmd_huge() could return non-zero but it expects follow_huge_pmd() to always return NULL. This means that we need arch-specific implementation which returns NULL. This behavior looks strange to me (because non-zero pmd_huge() implies that the architecture supports PMD-based hugepage, so follow_huge_pmd() can/should return some relevant value,) but that's beyond this cleanup patch, so let's keep it. Justification of non-trivial changes: - in s390, follow_huge_pmd() checks !MACHINE_HAS_HPAGE at first, and this patch removes the check. This is OK because we can assume MACHINE_HAS_HPAGE is true when follow_huge_pmd() can be called (note that pmd_huge() has the same check and always returns 0 for !MACHINE_HAS_HPAGE.) - in s390 and mips, we use HPAGE_MASK instead of PMD_MASK as done in common code. This patch forces these archs use PMD_MASK, but it's OK because they are identical in both archs. In s390, both of HPAGE_SHIFT and PMD_SHIFT are 20. In mips, HPAGE_SHIFT is defined as (PAGE_SHIFT + PAGE_SHIFT - 3) and PMD_SHIFT is define as (PAGE_SHIFT + PAGE_SHIFT + PTE_ORDER - 3), but PTE_ORDER is always 0, so these are identical. Signed-off-by: Naoya Horiguchi Acked-by: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/hugetlbpage.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index bca0aa3..f48423f 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -52,20 +52,8 @@ int pud_huge(pud_t pud) return 0; } -struct page * -follow_huge_pmd(struct mm_struct *mm, unsigned long address, - pmd_t *pmd, int write) -{ - return NULL; -} #else -struct page * -follow_huge_addr(struct mm_struct *mm, unsigned long address, int write) -{ - return ERR_PTR(-EINVAL); -} - int pmd_huge(pmd_t pmd) { return !!(pmd_val(pmd) & _PAGE_PSE); -- cgit v1.1 From cbef8478bee55775ac312a574aad48af7bb9cf9f Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:25:19 -0800 Subject: mm/hugetlb: pmd_huge() returns true for non-present hugepage Migrating hugepages and hwpoisoned hugepages are considered as non-present hugepages, and they are referenced via migration entries and hwpoison entries in their page table slots. This behavior causes race condition because pmd_huge() doesn't tell non-huge pages from migrating/hwpoisoned hugepages. follow_page_mask() is one example where the kernel would call follow_page_pte() for such hugepage while this function is supposed to handle only normal pages. To avoid this, this patch makes pmd_huge() return true when pmd_none() is true *and* pmd_present() is false. We don't have to worry about mixing up non-present pmd entry with normal pmd (pointing to leaf level pte entry) because pmd_present() is true in normal pmd. The same race condition could happen in (x86-specific) gup_pmd_range(), where this patch simply adds pmd_present() check instead of pmd_huge(). This is because gup_pmd_range() is fast path. If we have non-present hugepage in this function, we will go into gup_huge_pmd(), then return 0 at flag mask check, and finally fall back to the slow path. Fixes: 290408d4a2 ("hugetlb: hugepage migration core") Signed-off-by: Naoya Horiguchi Cc: Hugh Dickins Cc: James Hogan Cc: David Rientjes Cc: Mel Gorman Cc: Johannes Weiner Cc: Michal Hocko Cc: Rik van Riel Cc: Andrea Arcangeli Cc: Luiz Capitulino Cc: Nishanth Aravamudan Cc: Lee Schermerhorn Cc: Steve Capper Cc: [2.6.36+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 2 +- arch/x86/mm/hugetlbpage.c | 8 +++++++- 2 files changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index d754782..224b142 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -172,7 +172,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, */ if (pmd_none(pmd) || pmd_trans_splitting(pmd)) return 0; - if (unlikely(pmd_large(pmd))) { + if (unlikely(pmd_large(pmd) || !pmd_present(pmd))) { /* * NUMA hinting faults need to be handled in the GUP * slowpath for accounting purposes and so that they diff --git a/arch/x86/mm/hugetlbpage.c b/arch/x86/mm/hugetlbpage.c index f48423f..42982b2 100644 --- a/arch/x86/mm/hugetlbpage.c +++ b/arch/x86/mm/hugetlbpage.c @@ -54,9 +54,15 @@ int pud_huge(pud_t pud) #else +/* + * pmd_huge() returns 1 if @pmd is hugetlb related entry, that is normal + * hugetlb entry or non-present (migration or hwpoisoned) hugetlb entry. + * Otherwise, returns 0. + */ int pmd_huge(pmd_t pmd) { - return !!(pmd_val(pmd) & _PAGE_PSE); + return !pmd_none(pmd) && + (pmd_val(pmd) & (_PAGE_PRESENT|_PAGE_PSE)) != _PAGE_PRESENT; } int pud_huge(pud_t pud) -- cgit v1.1 From dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 11 Feb 2015 15:26:50 -0800 Subject: mm: account pmd page tables to the process Dave noticed that unprivileged process can allocate significant amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by oom-killer and memory cgroup. The trick is to allocate a lot of PMD page tables. Linux kernel doesn't account PMD tables to the process, only PTE. The use-cases below use few tricks to allocate a lot of PMD page tables while keeping VmRSS and VmPTE low. oom_score for the process will be 0. #include #include #include #include #include #include #define PUD_SIZE (1UL << 30) #define PMD_SIZE (1UL << 21) #define NR_PUD 130000 int main(void) { char *addr = NULL; unsigned long i; prctl(PR_SET_THP_DISABLE); for (i = 0; i < NR_PUD ; i++) { addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); if (addr == MAP_FAILED) { perror("mmap"); break; } *addr = 'x'; munmap(addr, PMD_SIZE); mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); if (addr == MAP_FAILED) perror("re-mmap"), exit(1); } printf("PID %d consumed %lu KiB in PMD page tables\n", getpid(), i * 4096 >> 10); return pause(); } The patch addresses the issue by account PMD tables to the process the same way we account PTE. The main place where PMD tables is accounted is __pmd_alloc() and free_pmd_range(). But there're few corner cases: - HugeTLB can share PMD page tables. The patch handles by accounting the table to all processes who share it. - x86 PAE pre-allocates few PMD tables on fork. - Architectures with FIRST_USER_ADDRESS > 0. We need to adjust sanity check on exit(2). Accounting only happens on configuration where PMD page table's level is present (PMD is not folded). As with nr_ptes we use per-mm counter. The counter value is used to calculate baseline for badness score by oom-killer. Signed-off-by: Kirill A. Shutemov Reported-by: Dave Hansen Cc: Hugh Dickins Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: David Rientjes Tested-by: Sedat Dilek Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 6fb6927..7b22ada 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd) #endif /* CONFIG_X86_PAE */ -static void free_pmds(pmd_t *pmds[]) +static void free_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; @@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[]) if (pmds[i]) { pgtable_pmd_page_dtor(virt_to_page(pmds[i])); free_page((unsigned long)pmds[i]); + mm_dec_nr_pmds(mm); } } -static int preallocate_pmds(pmd_t *pmds[]) +static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[]) { int i; bool failed = false; @@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[]) pmd = NULL; failed = true; } + if (pmd) + mm_inc_nr_pmds(mm); pmds[i] = pmd; } if (failed) { - free_pmds(pmds); + free_pmds(mm, pmds); return -ENOMEM; } @@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp) paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT); pmd_free(mm, pmd); + mm_dec_nr_pmds(mm); } } } @@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) mm->pgd = pgd; - if (preallocate_pmds(pmds) != 0) + if (preallocate_pmds(mm, pmds) != 0) goto out_free_pgd; if (paravirt_pgd_alloc(mm) != 0) @@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm) return pgd; out_free_pmds: - free_pmds(pmds); + free_pmds(mm, pmds); out_free_pgd: free_page((unsigned long)pgd); out: -- cgit v1.1 From a7b780750e1a1c7833812681e1f8fa30bbb06802 Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Wed, 11 Feb 2015 15:27:23 -0800 Subject: mm: gup: use get_user_pages_unlocked within get_user_pages_fast This allows the get_user_pages_fast slow path to release the mmap_sem before blocking. Signed-off-by: Andrea Arcangeli Reviewed-by: Kirill A. Shutemov Cc: Andres Lagar-Cavilla Cc: Peter Feiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 224b142..89df70e 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -388,10 +388,9 @@ slow_irqon: start += nr << PAGE_SHIFT; pages += nr; - down_read(&mm->mmap_sem); - ret = get_user_pages(current, mm, start, - (end - start) >> PAGE_SHIFT, write, 0, pages, NULL); - up_read(&mm->mmap_sem); + ret = get_user_pages_unlocked(current, mm, start, + (end - start) >> PAGE_SHIFT, + write, 0, pages); /* Have to be a bit careful with return values */ if (nr > 0) { -- cgit v1.1 From 8a0516ed8b90c95ffa1363b420caa37418149f21 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Thu, 12 Feb 2015 14:58:22 -0800 Subject: mm: convert p[te|md]_numa users to p[te|md]_protnone_numa Convert existing users of pte_numa and friends to the new helper. Note that the kernel is broken after this patch is applied until the other page table modifiers are also altered. This patch layout is to make review easier. Signed-off-by: Mel Gorman Acked-by: Linus Torvalds Acked-by: Aneesh Kumar Acked-by: Benjamin Herrenschmidt Tested-by: Sasha Levin Cc: Dave Jones Cc: Hugh Dickins Cc: Ingo Molnar Cc: Kirill Shutemov Cc: Paul Mackerras Cc: Rik van Riel Cc: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/gup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/gup.c b/arch/x86/mm/gup.c index 89df70e..81bf3d2 100644 --- a/arch/x86/mm/gup.c +++ b/arch/x86/mm/gup.c @@ -84,7 +84,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, struct page *page; /* Similar to the PMD case, NUMA hinting must take slow path */ - if (pte_numa(pte)) { + if (pte_protnone(pte)) { pte_unmap(ptep); return 0; } @@ -178,7 +178,7 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end, * slowpath for accounting purposes and so that they * can be serialised against THP migration. */ - if (pmd_numa(pmd)) + if (pmd_protnone(pmd)) return 0; if (!gup_huge_pmd(pmd, addr, next, write, pages, nr)) return 0; -- cgit v1.1 From bf58b4879c33b3475a33740562ebf6583f531d4a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 13 Feb 2015 14:37:12 -0800 Subject: x86: use %*pb[l] to print bitmaps including cpumasks and nodemasks printk and friends can now format bitmaps using '%*pb[l]'. cpumask and nodemask also provide cpumask_pr_args() and nodemask_pr_args() respectively which can be used to generate the two printf arguments necessary to format the specified cpu/nodemask. * Unnecessary buffer size calculation and condition on the lenght removed from intel_cacheinfo.c::show_shared_cpu_map_func(). * uv_nmi_nr_cpus_pr() got overly smart and implemented "..." abbreviation if the output stretched over the predefined 1024 byte buffer. Replaced with plain printk. Signed-off-by: Tejun Heo Cc: Mike Travis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/numa.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/numa.c b/arch/x86/mm/numa.c index 1a88370..cd4785b 100644 --- a/arch/x86/mm/numa.c +++ b/arch/x86/mm/numa.c @@ -794,7 +794,6 @@ int early_cpu_to_node(int cpu) void debug_cpumask_set_cpu(int cpu, int node, bool enable) { struct cpumask *mask; - char buf[64]; if (node == NUMA_NO_NODE) { /* early_cpu_to_node() already emits a warning and trace */ @@ -812,10 +811,9 @@ void debug_cpumask_set_cpu(int cpu, int node, bool enable) else cpumask_clear_cpu(cpu, mask); - cpulist_scnprintf(buf, sizeof(buf), mask); - printk(KERN_DEBUG "%s cpu %d node %d: mask now %s\n", + printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n", enable ? "numa_add_cpu" : "numa_remove_cpu", - cpu, node, buf); + cpu, node, cpumask_pr_args(mask)); return; } -- cgit v1.1 From ef7f0d6a6ca8c9e4b27d78895af86c2fbfaeedb2 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:25 -0800 Subject: x86_64: add KASan support This patch adds arch specific code for kernel address sanitizer. 16TB of virtual addressed used for shadow memory. It's located in range [ffffec0000000000 - fffffc0000000000] between vmemmap and %esp fixup stacks. At early stage we map whole shadow region with zero page. Latter, after pages mapped to direct mapping address range we unmap zero pages from corresponding shadow (see kasan_map_shadow()) and allocate and map a real shadow memory reusing vmemmap_populate() function. Also replace __pa with __pa_nodebug before shadow initialized. __pa with CONFIG_DEBUG_VIRTUAL=y make external function call (__phys_addr) __phys_addr is instrumented, so __asan_load could be called before shadow area initialized. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Jim Davis Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/Makefile | 3 + arch/x86/mm/kasan_init_64.c | 199 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 202 insertions(+) create mode 100644 arch/x86/mm/kasan_init_64.c (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index ecfdc46..c4cc740 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -20,6 +20,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o obj-$(CONFIG_KMEMCHECK) += kmemcheck/ +KASAN_SANITIZE_kasan_init_$(BITS).o := n +obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c new file mode 100644 index 0000000..3e4d9a1 --- /dev/null +++ b/arch/x86/mm/kasan_init_64.c @@ -0,0 +1,199 @@ +#include +#include +#include +#include +#include +#include + +#include +#include + +extern pgd_t early_level4_pgt[PTRS_PER_PGD]; +extern struct range pfn_mapped[E820_X_MAX]; + +extern unsigned char kasan_zero_page[PAGE_SIZE]; + +static int __init map_range(struct range *range) +{ + unsigned long start; + unsigned long end; + + start = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->start)); + end = (unsigned long)kasan_mem_to_shadow(pfn_to_kaddr(range->end)); + + /* + * end + 1 here is intentional. We check several shadow bytes in advance + * to slightly speed up fastpath. In some rare cases we could cross + * boundary of mapped shadow, so we just map some more here. + */ + return vmemmap_populate(start, end + 1, NUMA_NO_NODE); +} + +static void __init clear_pgds(unsigned long start, + unsigned long end) +{ + for (; start < end; start += PGDIR_SIZE) + pgd_clear(pgd_offset_k(start)); +} + +void __init kasan_map_early_shadow(pgd_t *pgd) +{ + int i; + unsigned long start = KASAN_SHADOW_START; + unsigned long end = KASAN_SHADOW_END; + + for (i = pgd_index(start); start < end; i++) { + pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) + | _KERNPG_TABLE); + start += PGDIR_SIZE; + } +} + +static int __init zero_pte_populate(pmd_t *pmd, unsigned long addr, + unsigned long end) +{ + pte_t *pte = pte_offset_kernel(pmd, addr); + + while (addr + PAGE_SIZE <= end) { + WARN_ON(!pte_none(*pte)); + set_pte(pte, __pte(__pa_nodebug(kasan_zero_page) + | __PAGE_KERNEL_RO)); + addr += PAGE_SIZE; + pte = pte_offset_kernel(pmd, addr); + } + return 0; +} + +static int __init zero_pmd_populate(pud_t *pud, unsigned long addr, + unsigned long end) +{ + int ret = 0; + pmd_t *pmd = pmd_offset(pud, addr); + + while (IS_ALIGNED(addr, PMD_SIZE) && addr + PMD_SIZE <= end) { + WARN_ON(!pmd_none(*pmd)); + set_pmd(pmd, __pmd(__pa_nodebug(kasan_zero_pte) + | __PAGE_KERNEL_RO)); + addr += PMD_SIZE; + pmd = pmd_offset(pud, addr); + } + if (addr < end) { + if (pmd_none(*pmd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pmd(pmd, __pmd(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pte_populate(pmd, addr, end); + } + return ret; +} + + +static int __init zero_pud_populate(pgd_t *pgd, unsigned long addr, + unsigned long end) +{ + int ret = 0; + pud_t *pud = pud_offset(pgd, addr); + + while (IS_ALIGNED(addr, PUD_SIZE) && addr + PUD_SIZE <= end) { + WARN_ON(!pud_none(*pud)); + set_pud(pud, __pud(__pa_nodebug(kasan_zero_pmd) + | __PAGE_KERNEL_RO)); + addr += PUD_SIZE; + pud = pud_offset(pgd, addr); + } + + if (addr < end) { + if (pud_none(*pud)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pud(pud, __pud(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pmd_populate(pud, addr, end); + } + return ret; +} + +static int __init zero_pgd_populate(unsigned long addr, unsigned long end) +{ + int ret = 0; + pgd_t *pgd = pgd_offset_k(addr); + + while (IS_ALIGNED(addr, PGDIR_SIZE) && addr + PGDIR_SIZE <= end) { + WARN_ON(!pgd_none(*pgd)); + set_pgd(pgd, __pgd(__pa_nodebug(kasan_zero_pud) + | __PAGE_KERNEL_RO)); + addr += PGDIR_SIZE; + pgd = pgd_offset_k(addr); + } + + if (addr < end) { + if (pgd_none(*pgd)) { + void *p = vmemmap_alloc_block(PAGE_SIZE, NUMA_NO_NODE); + if (!p) + return -ENOMEM; + set_pgd(pgd, __pgd(__pa_nodebug(p) | _KERNPG_TABLE)); + } + ret = zero_pud_populate(pgd, addr, end); + } + return ret; +} + + +static void __init populate_zero_shadow(const void *start, const void *end) +{ + if (zero_pgd_populate((unsigned long)start, (unsigned long)end)) + panic("kasan: unable to map zero shadow!"); +} + + +#ifdef CONFIG_KASAN_INLINE +static int kasan_die_handler(struct notifier_block *self, + unsigned long val, + void *data) +{ + if (val == DIE_GPF) { + pr_emerg("CONFIG_KASAN_INLINE enabled"); + pr_emerg("GPF could be caused by NULL-ptr deref or user memory access"); + } + return NOTIFY_OK; +} + +static struct notifier_block kasan_die_notifier = { + .notifier_call = kasan_die_handler, +}; +#endif + +void __init kasan_init(void) +{ + int i; + +#ifdef CONFIG_KASAN_INLINE + register_die_notifier(&kasan_die_notifier); +#endif + + memcpy(early_level4_pgt, init_level4_pgt, sizeof(early_level4_pgt)); + load_cr3(early_level4_pgt); + + clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END); + + populate_zero_shadow((void *)KASAN_SHADOW_START, + kasan_mem_to_shadow((void *)PAGE_OFFSET)); + + for (i = 0; i < E820_X_MAX; i++) { + if (pfn_mapped[i].end == 0) + break; + + if (map_range(&pfn_mapped[i])) + panic("kasan: unable to allocate shadow!"); + } + + populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), + (void *)KASAN_SHADOW_END); + + memset(kasan_zero_page, 0, PAGE_SIZE); + + load_cr3(init_level4_pgt); +} -- cgit v1.1 From c420f167db8c799d69fe43a801c58a7f02e9d57c Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:39:59 -0800 Subject: kasan: enable stack instrumentation Stack instrumentation allows to detect out of bounds memory accesses for variables allocated on stack. Compiler adds redzones around every variable on stack and poisons redzones in function's prologue. Such approach significantly increases stack usage, so all in-kernel stacks size were doubled. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/kasan_init_64.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 3e4d9a1..5350870 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -189,11 +189,18 @@ void __init kasan_init(void) if (map_range(&pfn_mapped[i])) panic("kasan: unable to allocate shadow!"); } - populate_zero_shadow(kasan_mem_to_shadow((void *)PAGE_OFFSET + MAXMEM), - (void *)KASAN_SHADOW_END); + kasan_mem_to_shadow((void *)__START_KERNEL_map)); + + vmemmap_populate((unsigned long)kasan_mem_to_shadow(_stext), + (unsigned long)kasan_mem_to_shadow(_end), + NUMA_NO_NODE); + + populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_VADDR), + (void *)KASAN_SHADOW_END); memset(kasan_zero_page, 0, PAGE_SIZE); load_cr3(init_level4_pgt); + init_task.kasan_depth = 0; } -- cgit v1.1 From bebf56a1b176c2e1c9efe44e7e6915532cc682cf Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Fri, 13 Feb 2015 14:40:17 -0800 Subject: kasan: enable instrumentation of global variables This feature let us to detect accesses out of bounds of global variables. This will work as for globals in kernel image, so for globals in modules. Currently this won't work for symbols in user-specified sections (e.g. __init, __read_mostly, ...) The idea of this is simple. Compiler increases each global variable by redzone size and add constructors invoking __asan_register_globals() function. Information about global variable (address, size, size with redzone ...) passed to __asan_register_globals() so we could poison variable's redzone. This patch also forces module_alloc() to return 8*PAGE_SIZE aligned address making shadow memory handling ( kasan_module_alloc()/kasan_module_free() ) more simple. Such alignment guarantees that each shadow page backing modules address space correspond to only one module_alloc() allocation. Signed-off-by: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Konstantin Serebryany Cc: Dmitry Chernenkov Signed-off-by: Andrey Konovalov Cc: Yuri Gribov Cc: Konstantin Khlebnikov Cc: Sasha Levin Cc: Christoph Lameter Cc: Joonsoo Kim Cc: Dave Hansen Cc: Andi Kleen Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/kasan_init_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 5350870..4860906 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -196,7 +196,7 @@ void __init kasan_init(void) (unsigned long)kasan_mem_to_shadow(_end), NUMA_NO_NODE); - populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_VADDR), + populate_zero_shadow(kasan_mem_to_shadow((void *)MODULES_END), (void *)KASAN_SHADOW_END); memset(kasan_zero_page, 0, PAGE_SIZE); -- cgit v1.1 From f15e05186c3244e9195378a0a568283a8ccc60b0 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Tue, 10 Feb 2015 13:20:30 -0800 Subject: x86/mm/init: Fix incorrect page size in init_memory_mapping() printks With 32-bit non-PAE kernels, we have 2 page sizes available (at most): 4k and 4M. Enabling PAE replaces that 4M size with a 2M one (which 64-bit systems use too). But, when booting a 32-bit non-PAE kernel, in one of our early-boot printouts, we say: init_memory_mapping: [mem 0x00000000-0x000fffff] [mem 0x00000000-0x000fffff] page 4k init_memory_mapping: [mem 0x37000000-0x373fffff] [mem 0x37000000-0x373fffff] page 2M init_memory_mapping: [mem 0x00100000-0x36ffffff] [mem 0x00100000-0x003fffff] page 4k [mem 0x00400000-0x36ffffff] page 2M init_memory_mapping: [mem 0x37400000-0x377fdfff] [mem 0x37400000-0x377fdfff] page 4k Which is obviously wrong. There is no 2M page available. This is probably because of a badly-named variable: in the map_range code: PG_LEVEL_2M. Instead of renaming all the PG_LEVEL_2M's. This patch just fixes the printout: init_memory_mapping: [mem 0x00000000-0x000fffff] [mem 0x00000000-0x000fffff] page 4k init_memory_mapping: [mem 0x37000000-0x373fffff] [mem 0x37000000-0x373fffff] page 4M init_memory_mapping: [mem 0x00100000-0x36ffffff] [mem 0x00100000-0x003fffff] page 4k [mem 0x00400000-0x36ffffff] page 4M init_memory_mapping: [mem 0x37400000-0x377fdfff] [mem 0x37400000-0x377fdfff] page 4k BRK [0x03206000, 0x03206fff] PGTABLE Signed-off-by: Dave Hansen Cc: Pekka Enberg Cc: Yinghai Lu Link: http://lkml.kernel.org/r/20150210212030.665EC267@viggo.jf.intel.com Signed-off-by: Borislav Petkov --- arch/x86/mm/init.c | 28 ++++++++++++++++++++++++++-- 1 file changed, 26 insertions(+), 2 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index 079c3b6..7ff2424 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -238,6 +238,31 @@ static void __init_refok adjust_range_page_size_mask(struct map_range *mr, } } +static const char *page_size_string(struct map_range *mr) +{ + static const char str_1g[] = "1G"; + static const char str_2m[] = "2M"; + static const char str_4m[] = "4M"; + static const char str_4k[] = "4k"; + + if (mr->page_size_mask & (1<page_size_mask & (1<page_size_mask & (1< Date: Sat, 14 Feb 2015 09:33:50 -0800 Subject: x86, mm/ASLR: Fix stack randomization on 64-bit systems The issue is that the stack for processes is not properly randomized on 64 bit architectures due to an integer overflow. The affected function is randomize_stack_top() in file "fs/binfmt_elf.c": static unsigned long randomize_stack_top(unsigned long stack_top) { unsigned int random_variable = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { random_variable = get_random_int() & STACK_RND_MASK; random_variable <<= PAGE_SHIFT; } return PAGE_ALIGN(stack_top) + random_variable; return PAGE_ALIGN(stack_top) - random_variable; } Note that, it declares the "random_variable" variable as "unsigned int". Since the result of the shifting operation between STACK_RND_MASK (which is 0x3fffff on x86_64, 22 bits) and PAGE_SHIFT (which is 12 on x86_64): random_variable <<= PAGE_SHIFT; then the two leftmost bits are dropped when storing the result in the "random_variable". This variable shall be at least 34 bits long to hold the (22+12) result. These two dropped bits have an impact on the entropy of process stack. Concretely, the total stack entropy is reduced by four: from 2^28 to 2^30 (One fourth of expected entropy). This patch restores back the entropy by correcting the types involved in the operations in the functions randomize_stack_top() and stack_maxrandom_size(). The successful fix can be tested with: $ for i in `seq 1 10`; do cat /proc/self/maps | grep stack; done 7ffeda566000-7ffeda587000 rw-p 00000000 00:00 0 [stack] 7fff5a332000-7fff5a353000 rw-p 00000000 00:00 0 [stack] 7ffcdb7a1000-7ffcdb7c2000 rw-p 00000000 00:00 0 [stack] 7ffd5e2c4000-7ffd5e2e5000 rw-p 00000000 00:00 0 [stack] ... Once corrected, the leading bytes should be between 7ffc and 7fff, rather than always being 7fff. Signed-off-by: Hector Marco-Gisbert Signed-off-by: Ismael Ripoll [ Rebased, fixed 80 char bugs, cleaned up commit message, added test example and CVE ] Signed-off-by: Kees Cook Cc: Cc: Linus Torvalds Cc: Andrew Morton Cc: Al Viro Fixes: CVE-2015-1593 Link: http://lkml.kernel.org/r/20150214173350.GA18393@www.outflux.net Signed-off-by: Borislav Petkov --- arch/x86/mm/mmap.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/mm') diff --git a/arch/x86/mm/mmap.c b/arch/x86/mm/mmap.c index 919b912..df4552b 100644 --- a/arch/x86/mm/mmap.c +++ b/arch/x86/mm/mmap.c @@ -35,12 +35,12 @@ struct va_alignment __read_mostly va_align = { .flags = -1, }; -static unsigned int stack_maxrandom_size(void) +static unsigned long stack_maxrandom_size(void) { - unsigned int max = 0; + unsigned long max = 0; if ((current->flags & PF_RANDOMIZE) && !(current->personality & ADDR_NO_RANDOMIZE)) { - max = ((-1U) & STACK_RND_MASK) << PAGE_SHIFT; + max = ((-1UL) & STACK_RND_MASK) << PAGE_SHIFT; } return max; -- cgit v1.1