From 1f5026a7e21e409c2b9dd54f6dfb9446511fb7c5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Jul 2011 09:58:09 +0200 Subject: memblock: Kill MEMBLOCK_ERROR 25818f0f28 (memblock: Make MEMBLOCK_ERROR be 0) thankfully made MEMBLOCK_ERROR 0 and there already are codes which expect error return to be 0. There's no point in keeping MEMBLOCK_ERROR around. End its misery. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1310457490-3356-6-git-send-email-tj@kernel.org Cc: Yinghai Lu Cc: Benjamin Herrenschmidt Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 2 +- arch/x86/kernel/check.c | 2 +- arch/x86/kernel/e820.c | 2 +- arch/x86/kernel/setup.c | 4 ++-- arch/x86/kernel/trampoline.c | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 3d2661c..5636308 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -88,7 +88,7 @@ static u32 __init allocate_aperture(void) */ addr = memblock_find_in_range(GART_MIN_ADDR, GART_MAX_ADDR, aper_size, aper_size); - if (addr == MEMBLOCK_ERROR || addr + aper_size > GART_MAX_ADDR) { + if (!addr || addr + aper_size > GART_MAX_ADDR) { printk(KERN_ERR "Cannot allocate aperture memory hole (%lx,%uK)\n", addr, aper_size>>10); diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 452932d..95680fc 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -86,7 +86,7 @@ void __init setup_bios_corruption_check(void) u64 size; addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); - if (addr == MEMBLOCK_ERROR) + if (!addr) break; if (addr >= corruption_check_size) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 3e2ef84..0f9ff58 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -745,7 +745,7 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) for (start = startt; ; start += size) { start = memblock_x86_find_in_range_size(start, &size, align); - if (start == MEMBLOCK_ERROR) + if (!start) return 0; if (size >= sizet) break; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index afaf384..31ffe20 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -331,7 +331,7 @@ static void __init relocate_initrd(void) ramdisk_here = memblock_find_in_range(0, end_of_lowmem, area_size, PAGE_SIZE); - if (ramdisk_here == MEMBLOCK_ERROR) + if (!ramdisk_here) panic("Cannot find place for new RAMDISK of size %lld\n", ramdisk_size); @@ -554,7 +554,7 @@ static void __init reserve_crashkernel(void) crash_base = memblock_find_in_range(alignment, CRASH_KERNEL_ADDR_MAX, crash_size, alignment); - if (crash_base == MEMBLOCK_ERROR) { + if (!crash_base) { pr_info("crashkernel reservation failed - No suitable area found.\n"); return; } diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a91ae77..a1f13dd 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -14,7 +14,7 @@ void __init setup_trampolines(void) /* Has to be in very low memory so we can execute real-mode AP code. 
*/ mem = memblock_find_in_range(0, 1<<20, size, PAGE_SIZE); - if (mem == MEMBLOCK_ERROR) + if (!mem) panic("Cannot allocate trampoline\n"); x86_trampoline_base = __va(mem); -- cgit v1.1 From ab5d140b9eafae402aa3e673a63c5ef6164a9dd2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Jul 2011 11:15:58 +0200 Subject: x86: Use __memblock_alloc_base() in early_reserve_e820() early_reserve_e820() implements its own ad-hoc early allocator using memblock_x86_find_in_range_size(). Use __memblock_alloc_base() instead and remove the unnecessary @startt parameter (it's top-down allocation anyway). Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1310462166-31469-6-git-send-email-tj@kernel.org Cc: Yinghai Lu Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: H. Peter Anvin --- arch/x86/kernel/e820.c | 30 ++++++------------------------ arch/x86/kernel/mpparse.c | 6 ++---- 2 files changed, 8 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 0f9ff58..b99d940 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -737,35 +737,17 @@ core_initcall(e820_mark_nvs_memory); /* * pre allocated 4k and reserved it in memblock and e820_saved */ -u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) +u64 __init early_reserve_e820(u64 size, u64 align) { - u64 size = 0; u64 addr; - u64 start; - for (start = startt; ; start += size) { - start = memblock_x86_find_in_range_size(start, &size, align); - if (!start) - return 0; - if (size >= sizet) - break; + addr = __memblock_alloc_base(size, align, MEMBLOCK_ALLOC_ACCESSIBLE); + if (addr) { + e820_update_range_saved(addr, size, E820_RAM, E820_RESERVED); + printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); + update_e820_saved(); } -#ifdef CONFIG_X86_32 - if (start >= MAXMEM) - return 0; - if (start + size > MAXMEM) - size = MAXMEM - start; -#endif - - addr = round_down(start + size - sizet, align); - if (addr < start) - return 0; - memblock_x86_reserve_range(addr, addr + sizet, "new next"); - e820_update_range_saved(addr, sizet, E820_RAM, E820_RESERVED); - printk(KERN_INFO "update e820_saved for early_reserve_e820\n"); - update_e820_saved(); - return addr; } diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9103b89..8faeaa0 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -836,10 +836,8 @@ early_param("alloc_mptable", parse_alloc_mptable_opt); void __init early_reserve_e820_mpc_new(void) { - if (enable_update_mptable && alloc_mptable) { - u64 startt = 0; - mpc_new_phys = early_reserve_e820(startt, mpc_new_length, 4); - } + if (enable_update_mptable && alloc_mptable) + mpc_new_phys = early_reserve_e820(mpc_new_length, 4); } static int __init update_mp_table(void) -- cgit v1.1 From 8d89ac808417e92a33fb5fa3c86352016643775a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Jul 2011 11:16:00 +0200 Subject: x86: Replace memblock_x86_find_in_range_size() with for_each_free_mem_range() setup_bios_corruption_check() and memtest do_one_pass() open code memblock free area iteration using memblock_x86_find_in_range_size(). Convert them to use for_each_free_mem_range() instead. This leaves memblock_x86_find_in_range_size() and memblock_x86_check_reserved_size() unused. Kill them. 
Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1310462166-31469-8-git-send-email-tj@kernel.org Cc: Yinghai Lu Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: H. Peter Anvin --- arch/x86/kernel/check.c | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 95680fc..621cd23 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -62,7 +62,8 @@ early_param("memory_corruption_check_size", set_corruption_check_size); void __init setup_bios_corruption_check(void) { - u64 addr = PAGE_SIZE; /* assume first page is reserved anyway */ + phys_addr_t start, end; + u64 i; if (memory_corruption_check == -1) { memory_corruption_check = @@ -82,28 +83,23 @@ void __init setup_bios_corruption_check(void) corruption_check_size = round_up(corruption_check_size, PAGE_SIZE); - while (addr < corruption_check_size && num_scan_areas < MAX_SCAN_AREAS) { - u64 size; - addr = memblock_x86_find_in_range_size(addr, &size, PAGE_SIZE); + for_each_free_mem_range(i, MAX_NUMNODES, &start, &end, NULL) { + start = clamp_t(phys_addr_t, round_up(start, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + end = clamp_t(phys_addr_t, round_down(end, PAGE_SIZE), + PAGE_SIZE, corruption_check_size); + if (start >= end) + continue; - if (!addr) - break; - - if (addr >= corruption_check_size) - break; - - if ((addr + size) > corruption_check_size) - size = corruption_check_size - addr; - - memblock_x86_reserve_range(addr, addr + size, "SCAN RAM"); - scan_areas[num_scan_areas].addr = addr; - scan_areas[num_scan_areas].size = size; - num_scan_areas++; + memblock_x86_reserve_range(start, end, "SCAN RAM"); + scan_areas[num_scan_areas].addr = start; + scan_areas[num_scan_areas].size = end - start; /* Assume we've already mapped this early memory */ - memset(__va(addr), 0, size); + memset(__va(start), 0, end - start); - addr += size; + if (++num_scan_areas >= MAX_SCAN_AREAS) + break; } if (num_scan_areas) -- cgit v1.1 From 6b5d41a1b97f5529284f16170211b87fd60264c0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Jul 2011 11:16:03 +0200 Subject: memblock, x86: Reimplement memblock_find_dma_reserve() using iterators memblock_find_dma_reserve() wants to find out how much memory is reserved under MAX_DMA_PFN. memblock_x86_memory_[free_]in_range() are used to find out the amounts of all available and free memory in the area, which are then subtracted to find out the amount of reservation. memblock_x86_memory_[free_]in_range() are implemented using __memblock_x86_memory_in_range() which builds ranges from memblock and then counts them, which is rather unnecessarily complex. This patch open codes the counting logic directly in memblock_find_dma_reserve() using memblock iterators and removes now unused __memblock_x86_memory_in_range() and find_range_array(). Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1310462166-31469-11-git-send-email-tj@kernel.org Cc: Yinghai Lu Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/e820.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index b99d940..84475f1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1093,15 +1093,30 @@ void __init memblock_x86_fill(void) void __init memblock_find_dma_reserve(void) { #ifdef CONFIG_X86_64 - u64 free_size_pfn; - u64 mem_size_pfn; + u64 nr_pages = 0, nr_free_pages = 0; + unsigned long start_pfn, end_pfn; + phys_addr_t start, end; + int i; + u64 u; + /* * need to find out used area below MAX_DMA_PFN * need to use memblock to get free size in [0, MAX_DMA_PFN] * at first, and assume boot_mem will not take below MAX_DMA_PFN */ - mem_size_pfn = memblock_x86_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; - free_size_pfn = memblock_x86_free_memory_in_range(0, MAX_DMA_PFN << PAGE_SHIFT) >> PAGE_SHIFT; - set_dma_reserve(mem_size_pfn - free_size_pfn); + for_each_mem_pfn_range(i, MAX_NUMNODES, &start_pfn, &end_pfn, NULL) { + start_pfn = min_t(unsigned long, start_pfn, MAX_DMA_PFN); + end_pfn = min_t(unsigned long, end_pfn, MAX_DMA_PFN); + nr_pages += end_pfn - start_pfn; + } + + for_each_free_mem_range(u, MAX_NUMNODES, &start, &end, NULL) { + start_pfn = min_t(unsigned long, PFN_UP(start), MAX_DMA_PFN); + end_pfn = min_t(unsigned long, PFN_DOWN(end), MAX_DMA_PFN); + if (start_pfn < end_pfn) + nr_free_pages += end_pfn - start_pfn; + } + + set_dma_reserve(nr_pages - nr_free_pages); #endif } -- cgit v1.1 From 24aa07882b672fff2da2f5c955759f0bd13d32d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 Jul 2011 11:16:06 +0200 Subject: memblock, x86: Replace memblock_x86_reserve/free_range() with generic ones Other than sanity check and debug message, the x86 specific version of memblock reserve/free functions are simple wrappers around the generic versions - memblock_reserve/free(). This patch adds debug messages with caller identification to the generic versions and replaces x86 specific ones and kills them. arch/x86/include/asm/memblock.h and arch/x86/mm/memblock.c are empty after this change and removed. Signed-off-by: Tejun Heo Link: http://lkml.kernel.org/r/1310462166-31469-14-git-send-email-tj@kernel.org Cc: Yinghai Lu Cc: Benjamin Herrenschmidt Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Signed-off-by: H. Peter Anvin --- arch/x86/kernel/aperture_64.c | 2 +- arch/x86/kernel/check.c | 2 +- arch/x86/kernel/head.c | 2 +- arch/x86/kernel/head32.c | 5 +++-- arch/x86/kernel/head64.c | 5 +++-- arch/x86/kernel/mpparse.c | 6 ++---- arch/x86/kernel/setup.c | 17 ++++++++--------- arch/x86/kernel/trampoline.c | 2 +- 8 files changed, 20 insertions(+), 21 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/aperture_64.c b/arch/x86/kernel/aperture_64.c index 5636308..6e76c19 100644 --- a/arch/x86/kernel/aperture_64.c +++ b/arch/x86/kernel/aperture_64.c @@ -94,7 +94,7 @@ static u32 __init allocate_aperture(void) addr, aper_size>>10); return 0; } - memblock_x86_reserve_range(addr, addr + aper_size, "aperture64"); + memblock_reserve(addr, aper_size); /* * Kmemleak should not scan this block as it may not be mapped via the * kernel direct mapping. 
diff --git a/arch/x86/kernel/check.c b/arch/x86/kernel/check.c index 621cd23..5da1269 100644 --- a/arch/x86/kernel/check.c +++ b/arch/x86/kernel/check.c @@ -91,7 +91,7 @@ void __init setup_bios_corruption_check(void) if (start >= end) continue; - memblock_x86_reserve_range(start, end, "SCAN RAM"); + memblock_reserve(start, end - start); scan_areas[num_scan_areas].addr = start; scan_areas[num_scan_areas].size = end - start; diff --git a/arch/x86/kernel/head.c b/arch/x86/kernel/head.c index af0699b..48d9d4e 100644 --- a/arch/x86/kernel/head.c +++ b/arch/x86/kernel/head.c @@ -52,5 +52,5 @@ void __init reserve_ebda_region(void) lowmem = 0x9f000; /* reserve all memory between lowmem and the 1MB mark */ - memblock_x86_reserve_range(lowmem, 0x100000, "* BIOS reserved"); + memblock_reserve(lowmem, 0x100000 - lowmem); } diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index 3bb0850..be9282b 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -33,7 +33,8 @@ void __init i386_start_kernel(void) { memblock_init(); - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -42,7 +43,7 @@ void __init i386_start_kernel(void) u64 ramdisk_image = boot_params.hdr.ramdisk_image; u64 ramdisk_size = boot_params.hdr.ramdisk_size; u64 ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } #endif diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index 5655c22..fd25b11 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -100,7 +100,8 @@ void __init x86_64_start_reservations(char *real_mode_data) memblock_init(); - memblock_x86_reserve_range(__pa_symbol(&_text), __pa_symbol(&__bss_stop), "TEXT DATA BSS"); + memblock_reserve(__pa_symbol(&_text), + __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); #ifdef CONFIG_BLK_DEV_INITRD /* Reserve INITRD */ @@ -109,7 +110,7 @@ void __init x86_64_start_reservations(char *real_mode_data) unsigned long ramdisk_image = boot_params.hdr.ramdisk_image; unsigned long ramdisk_size = boot_params.hdr.ramdisk_size; unsigned long ramdisk_end = PAGE_ALIGN(ramdisk_image + ramdisk_size); - memblock_x86_reserve_range(ramdisk_image, ramdisk_end, "RAMDISK"); + memblock_reserve(ramdisk_image, ramdisk_end - ramdisk_image); } #endif diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 8faeaa0..a6b79c1 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -564,9 +564,7 @@ void __init default_get_smp_config(unsigned int early) static void __init smp_reserve_memory(struct mpf_intel *mpf) { - unsigned long size = get_mpc_size(mpf->physptr); - - memblock_x86_reserve_range(mpf->physptr, mpf->physptr+size, "* MP-table mpc"); + memblock_reserve(mpf->physptr, get_mpc_size(mpf->physptr)); } static int __init smp_scan_config(unsigned long base, unsigned long length) @@ -595,7 +593,7 @@ static int __init smp_scan_config(unsigned long base, unsigned long length) mpf, (u64)virt_to_phys(mpf)); mem = virt_to_phys(mpf); - memblock_x86_reserve_range(mem, mem + sizeof(*mpf), "* MP-table mpf"); + memblock_reserve(mem, sizeof(*mpf)); if (mpf->physptr) smp_reserve_memory(mpf); diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 31ffe20..97d227e 100644 --- a/arch/x86/kernel/setup.c +++ 
b/arch/x86/kernel/setup.c @@ -306,7 +306,8 @@ static void __init cleanup_highmap(void) static void __init reserve_brk(void) { if (_brk_end > _brk_start) - memblock_x86_reserve_range(__pa(_brk_start), __pa(_brk_end), "BRK"); + memblock_reserve(__pa(_brk_start), + __pa(_brk_end) - __pa(_brk_start)); /* Mark brk area as locked down and no longer taking any new allocations */ @@ -337,7 +338,7 @@ static void __init relocate_initrd(void) /* Note: this includes all the lowmem currently occupied by the initrd, we rely on that fact to keep the data intact. */ - memblock_x86_reserve_range(ramdisk_here, ramdisk_here + area_size, "NEW RAMDISK"); + memblock_reserve(ramdisk_here, area_size); initrd_start = ramdisk_here + PAGE_OFFSET; initrd_end = initrd_start + ramdisk_size; printk(KERN_INFO "Allocated new RAMDISK: %08llx - %08llx\n", @@ -393,7 +394,7 @@ static void __init reserve_initrd(void) initrd_start = 0; if (ramdisk_size >= (end_of_lowmem>>1)) { - memblock_x86_free_range(ramdisk_image, ramdisk_end); + memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); printk(KERN_ERR "initrd too large to handle, " "disabling initrd\n"); return; @@ -416,7 +417,7 @@ static void __init reserve_initrd(void) relocate_initrd(); - memblock_x86_free_range(ramdisk_image, ramdisk_end); + memblock_free(ramdisk_image, ramdisk_end - ramdisk_image); } #else static void __init reserve_initrd(void) @@ -490,15 +491,13 @@ static void __init memblock_x86_reserve_range_setup_data(void) { struct setup_data *data; u64 pa_data; - char buf[32]; if (boot_params.hdr.version < 0x0209) return; pa_data = boot_params.hdr.setup_data; while (pa_data) { data = early_memremap(pa_data, sizeof(*data)); - sprintf(buf, "setup data %x", data->type); - memblock_x86_reserve_range(pa_data, pa_data+sizeof(*data)+data->len, buf); + memblock_reserve(pa_data, sizeof(*data) + data->len); pa_data = data->next; early_iounmap(data, sizeof(*data)); } @@ -568,7 +567,7 @@ static void __init reserve_crashkernel(void) return; } } - memblock_x86_reserve_range(crash_base, crash_base + crash_size, "CRASH KERNEL"); + memblock_reserve(crash_base, crash_size); printk(KERN_INFO "Reserving %ldMB of memory at %ldMB " "for crashkernel (System RAM: %ldMB)\n", @@ -626,7 +625,7 @@ static __init void reserve_ibft_region(void) addr = find_ibft_region(&size); if (size) - memblock_x86_reserve_range(addr, addr + size, "* ibft"); + memblock_reserve(addr, size); } static unsigned reserve_low = CONFIG_X86_RESERVE_LOW << 10; diff --git a/arch/x86/kernel/trampoline.c b/arch/x86/kernel/trampoline.c index a1f13dd..a73b610 100644 --- a/arch/x86/kernel/trampoline.c +++ b/arch/x86/kernel/trampoline.c @@ -18,7 +18,7 @@ void __init setup_trampolines(void) panic("Cannot allocate trampoline\n"); x86_trampoline_base = __va(mem); - memblock_x86_reserve_range(mem, mem + size, "TRAMPOLINE"); + memblock_reserve(mem, size); printk(KERN_DEBUG "Base memory trampoline at [%p] %llx size %zu\n", x86_trampoline_base, (unsigned long long)mem, size); -- cgit v1.1 From 57d1c0c03c6b48b2b96870d831b9ce6b917f53ac Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Oct 2011 13:36:40 +0200 Subject: perf/x86: Fix PEBS instruction unwind Masami spotted that we always try to decode the instruction stream as 64bit instructions when running a 64bit kernel, this doesn't work for ia32-compat proglets. Use TIF_IA32 to detect if we need to use the 32bit instruction decoder. 
Reported-by: Masami Hiramatsu Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel_ds.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index c0d238f..73da6b6 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -493,6 +493,7 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) unsigned long from = cpuc->lbr_entries[0].from; unsigned long old_to, to = cpuc->lbr_entries[0].to; unsigned long ip = regs->ip; + int is_64bit = 0; /* * We don't need to fixup if the PEBS assist is fault like @@ -544,7 +545,10 @@ static int intel_pmu_pebs_fixup_ip(struct pt_regs *regs) } else kaddr = (void *)to; - kernel_insn_init(&insn, kaddr); +#ifdef CONFIG_X86_64 + is_64bit = kernel_ip(to) || !test_thread_flag(TIF_IA32); +#endif + insn_init(&insn, kaddr, is_64bit); insn_get_length(&insn); to += insn.length; } while (to < ip); -- cgit v1.1 From aa2bc1ade59003a379ffc485d6da2d92ea3370a6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 9 Nov 2011 17:56:37 +0100 Subject: perf: Don't use -ENOSPC for out of PMU resources People (Linus) objected to using -ENOSPC to signal not having enough resources on the PMU to satisfy the request. Use -EINVAL. Requested-by: Linus Torvalds Cc: Stephane Eranian Cc: Will Deacon Cc: Deng-Cheng Zhu Cc: David Daney Cc: Ralf Baechle Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-xv8geaz2zpbjhlx0svmpp28n@git.kernel.org [ merged to newer kernel, fixed up MIPS impact ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 10 +++++----- arch/x86/kernel/cpu/perf_event_p4.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6408910..ff0e8d4 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -588,7 +588,7 @@ done: x86_pmu.put_event_constraints(cpuc, cpuc->event_list[i]); } } - return num ? -ENOSPC : 0; + return num ? -EINVAL : 0; } /* @@ -607,7 +607,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, if (is_x86_event(leader)) { if (n >= max_count) - return -ENOSPC; + return -EINVAL; cpuc->event_list[n] = leader; n++; } @@ -620,7 +620,7 @@ static int collect_events(struct cpu_hw_events *cpuc, struct perf_event *leader, continue; if (n >= max_count) - return -ENOSPC; + return -EINVAL; cpuc->event_list[n] = event; n++; @@ -1316,7 +1316,7 @@ static int validate_event(struct perf_event *event) c = x86_pmu.get_event_constraints(fake_cpuc, event); if (!c || !c->weight) - ret = -ENOSPC; + ret = -EINVAL; if (x86_pmu.put_event_constraints) x86_pmu.put_event_constraints(fake_cpuc, event); @@ -1341,7 +1341,7 @@ static int validate_group(struct perf_event *event) { struct perf_event *leader = event->group_leader; struct cpu_hw_events *fake_cpuc; - int ret = -ENOSPC, n; + int ret = -EINVAL, n; fake_cpuc = allocate_fake_cpuc(); if (IS_ERR(fake_cpuc)) diff --git a/arch/x86/kernel/cpu/perf_event_p4.c b/arch/x86/kernel/cpu/perf_event_p4.c index 492bf13..ef484d9 100644 --- a/arch/x86/kernel/cpu/perf_event_p4.c +++ b/arch/x86/kernel/cpu/perf_event_p4.c @@ -1268,7 +1268,7 @@ reserve: } done: - return num ? -ENOSPC : 0; + return num ? 
-EINVAL : 0; } static __initconst const struct x86_pmu p4_pmu = { -- cgit v1.1 From ed13ec58bfe0d5dc95f748e6118432cb0fa283cb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Nov 2011 10:03:25 +0100 Subject: perf/x86: Enable raw event access to Intel offcore events Now that the core offcore support is fixed up (thanks Stephane) and we have sane generic events utilizing them, re-enable the raw access to the feature as well. Note that it doesn't matter if you use event 0x1b7 or 0x1bb to specify an offcore event, either one works and neither guarantees you'll end up on a particular offcore MSR. Based on original patch from: Vince Weaver . Signed-off-by: Peter Zijlstra Cc: Vince Weaver . Cc: Stephane Eranian Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1108031200390.703@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index ff0e8d4..2bda212 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -312,12 +312,8 @@ int x86_setup_perfctr(struct perf_event *event) return -EOPNOTSUPP; } - /* - * Do not allow config1 (extended registers) to propagate, - * there's no sane user-space generalization yet: - */ if (attr->type == PERF_TYPE_RAW) - return 0; + return x86_pmu_extra_regs(event->attr.config, event); if (attr->type == PERF_TYPE_HW_CACHE) return set_ext_hw_attr(hwc, event); -- cgit v1.1 From b7743970b054a08acf6445cc6d10838e60cdb639 Mon Sep 17 00:00:00 2001 From: Deepak Saxena Date: Tue, 1 Nov 2011 14:25:07 -0700 Subject: time: x86: Remove CLOCK_TICK_RATE from tsc code The tsc code uses CLOCK_TICK_RATE which on x86 is defined to just be the same as PIT_TICK_RATE. This patch updates the code to use the latter as we want to deprecate and remove the global CLOCK_TICK_RATE symbol. Signed-off-by: Deepak Saxena Signed-off-by: John Stultz --- arch/x86/kernel/tsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db48336..1e88c8e 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -178,11 +178,11 @@ static unsigned long calc_pmtimer_ref(u64 deltatsc, u64 pm1, u64 pm2) } #define CAL_MS 10 -#define CAL_LATCH (CLOCK_TICK_RATE / (1000 / CAL_MS)) +#define CAL_LATCH (PIT_TICK_RATE / (1000 / CAL_MS)) #define CAL_PIT_LOOPS 1000 #define CAL2_MS 50 -#define CAL2_LATCH (CLOCK_TICK_RATE / (1000 / CAL2_MS)) +#define CAL2_LATCH (PIT_TICK_RATE / (1000 / CAL2_MS)) #define CAL2_PIT_LOOPS 5000 -- cgit v1.1 From e5fd47bfab2df0c2184cc0bf4245d8e1bb7724fb Mon Sep 17 00:00:00 2001 From: Konrad Rzeszutek Wilk Date: Mon, 21 Nov 2011 18:02:02 -0500 Subject: xen/pm_idle: Make pm_idle be default_idle under Xen. The idea behind commit d91ee5863b71 ("cpuidle: replace xen access to x86 pm_idle and default_idle") was to have one call - disable_cpuidle() which would make pm_idle not be molested by other code. It disallows cpuidle_idle_call to be set to pm_idle (which is excellent). But in the select_idle_routine() and idle_setup(), the pm_idle can still be set to either: amd_e400_idle, mwait_idle or default_idle. This depends on some CPU flags (MWAIT) and in the AMD case on the type of CPU.
In case of mwait_idle we can hit some instances where the hypervisor (Amazon EC2 specifically) sets the MWAIT and we get: Brought up 2 CPUs invalid opcode: 0000 [#1] SMP Pid: 0, comm: swapper Not tainted 3.1.0-0.rc6.git0.3.fc16.x86_64 #1 RIP: e030:[] [] mwait_idle+0x6f/0xb4 ... Call Trace: [] cpu_idle+0xae/0xe8 [] cpu_bringup_and_idle+0xe/0x10 RIP [] mwait_idle+0x6f/0xb4 RSP In the case of amd_e400_idle we don't get such spectacular crashes, but we do end up making an MSR which is trapped in the hypervisor, and then follow it up with a yield hypercall. Meaning we end up going to hypervisor twice instead of just once. The previous behavior before v3.0 was that pm_idle was set to default_idle regardless of select_idle_routine/idle_setup. We want to do that, but only for one specific case: Xen. This patch does that. Fixes RH BZ #739499 and Ubuntu #881076 Reported-by: Stefan Bader Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Linus Torvalds --- arch/x86/kernel/process.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index b9b3b1a..ee5d4fb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -403,6 +403,14 @@ void default_idle(void) EXPORT_SYMBOL(default_idle); #endif +bool set_pm_idle_to_default(void) +{ + bool ret = !!pm_idle; + + pm_idle = default_idle; + + return ret; +} void stop_this_cpu(void *dummy) { local_irq_disable(); -- cgit v1.1 From 8e8da023f5af71662867729db5547dc54786093c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 4 Dec 2011 11:57:09 -0800 Subject: x86: Fix boot failures on older AMD CPU's People with old AMD chips are getting hung boots, because commit bcb80e53877c ("x86, microcode, AMD: Add microcode revision to /proc/cpuinfo") moved the microcode detection too early into "early_init_amd()". At that point we are *so* early in the boot that the exception tables haven't even been set up yet, so the whole rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); doesn't actually work: if the rdmsr does a GP fault (due to non-existent MSR register on older CPU's), we can't fix it up yet, and the boot fails. Fix it by simply moving the code to a slightly later point in the boot (init_amd() instead of early_init_amd()), since the kernel itself doesn't even really care about the microcode patchlevel at this point (or really ever: it's made available to user space in /proc/cpuinfo, and updated if you do a microcode load). Reported-tested-and-bisected-by: Larry Finger Tested-by: Bob Tracy Acked-by: Borislav Petkov Cc: Ingo Molnar Cc: Srivatsa S.
Bhat Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/amd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index c7e46cb..0bab2b1 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -442,8 +442,6 @@ static void __cpuinit bsp_init_amd(struct cpuinfo_x86 *c) static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) { - u32 dummy; - early_init_amd_mc(c); /* @@ -473,12 +471,12 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) set_cpu_cap(c, X86_FEATURE_EXTD_APICID); } #endif - - rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) { + u32 dummy; + #ifdef CONFIG_SMP unsigned long long value; @@ -657,6 +655,8 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) checking_wrmsrl(MSR_AMD64_MCx_MASK(4), mask); } } + + rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); } #ifdef CONFIG_X86_32 -- cgit v1.1 From 6a600a8b8749566a7d81ad75dcb8bf5342b5a39a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 15 Nov 2011 10:51:15 +0100 Subject: perf, x86: Disable PEBS on SandyBridge chips Cc: Stephane Eranian Cc: stable@kernel.org Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2be5ebe..8d601b1 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1545,6 +1545,13 @@ static void intel_clovertown_quirks(void) x86_pmu.pebs_constraints = NULL; } +static void intel_sandybridge_quirks(void) +{ + printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); + x86_pmu.pebs = 0; + x86_pmu.pebs_constraints = NULL; +} + __init int intel_pmu_init(void) { union cpuid10_edx edx; @@ -1694,6 +1701,7 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ + x86_pmu.quirks = intel_sandybridge_quirks; case 45: /* SandyBridge, "Romely-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); -- cgit v1.1 From 16e5294e5f8303756a179cf218e37dfb9ed34417 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Tue, 8 Nov 2011 19:20:44 +0100 Subject: perf, x86: Force IBS LVT offset assignment for family 10h On AMD family 10h we see firmware bug messages like the following: [Firmware Bug]: cpu 6, try to use APIC500 (LVT offset 0) for vector 0x10400, but the register is already in use for vector 0xf9 on another cpu [Firmware Bug]: cpu 6, IBS interrupt offset 0 not available (MSRC001103A=0x0000000000000100) [Firmware Bug]: using offset 1 for IBS interrupts [Firmware Bug]: workaround enabled for IBS LVT offset perf: AMD IBS detected (0x00000007) We always see this, since the offsets are not assigned by the BIOS for this family. Force LVT offset assignment in this case. If the OS assignment fails, fallback to BIOS settings and try to setup this. The fallback to BIOS settings weakens the family check since force_ibs_eilvt_setup() may fail e.g. in case of virtual machines. But setup may still succeed if BIOS offsets are correct. Other families don't have a workaround implemented that assigns LVT offsets. It's ok, to drop calling force_ibs_eilvt_setup() for that families. With the patch the [Firmware Bug] messages vanish. 
We see now: IBS: LVT offset 1 assigned perf: AMD IBS detected (0x00000007) Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20111109162225.GO12451@erda.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd_ibs.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_amd_ibs.c b/arch/x86/kernel/cpu/perf_event_amd_ibs.c index ab6343d..3b8a2d3 100644 --- a/arch/x86/kernel/cpu/perf_event_amd_ibs.c +++ b/arch/x86/kernel/cpu/perf_event_amd_ibs.c @@ -199,8 +199,7 @@ static int force_ibs_eilvt_setup(void) goto out; } - pr_err(FW_BUG "using offset %d for IBS interrupts\n", offset); - pr_err(FW_BUG "workaround enabled for IBS LVT offset\n"); + pr_info("IBS: LVT offset %d assigned\n", offset); return 0; out: @@ -265,19 +264,23 @@ perf_ibs_cpu_notifier(struct notifier_block *self, unsigned long action, void *h static __init int amd_ibs_init(void) { u32 caps; - int ret; + int ret = -EINVAL; caps = __get_ibs_caps(); if (!caps) return -ENODEV; /* ibs not supported by the cpu */ - if (!ibs_eilvt_valid()) { - ret = force_ibs_eilvt_setup(); - if (ret) { - pr_err("Failed to setup IBS, %d\n", ret); - return ret; - } - } + /* + * Force LVT offset assignment for family 10h: The offsets are + * not assigned by the BIOS for this family, so the OS is + * responsible for doing it. If the OS assignment fails, fall + * back to BIOS settings and try to setup this. + */ + if (boot_cpu_data.x86 == 0x10) + force_ibs_eilvt_setup(); + + if (!ibs_eilvt_valid()) + goto out; get_online_cpus(); ibs_caps = caps; @@ -287,7 +290,11 @@ static __init int amd_ibs_init(void) smp_call_function(setup_APIC_ibs, NULL, 1); put_online_cpus(); - return perf_event_ibs_init(); + ret = perf_event_ibs_init(); +out: + if (ret) + pr_err("Failed to setup IBS, %d\n", ret); + return ret; } /* Since we need the pci subsystem to init ibs we can't do this earlier: */ -- cgit v1.1 From 69682b625a043b567873e6cda397969b502f0054 Mon Sep 17 00:00:00 2001 From: Mitsuo Hayasaka Date: Tue, 29 Nov 2011 15:08:21 +0900 Subject: x86: Add user_mode_vm check in stack_overflow_check The kernel stack overflow is checked in stack_overflow_check(), which may wrongly detect the overflow if the stack pointer in user space points to the kernel stack intentionally or accidentally. So, the actual overflow is never detected after this misdetection because WARN_ONCE() is used on the detection of it. This patch adds user-mode-vm checking before it to avoid this problem and bails out early if the user stack is used. Signed-off-by: Mitsuo Hayasaka Cc: yrl.pp-manager.tt@hitachi.com Cc: Randy Dunlap Link: http://lkml.kernel.org/r/20111129060821.11076.55315.stgit@ltc219.sdl.hitachi.co.jp Signed-off-by: Ingo Molnar Cc: "H. 
Peter Anvin" --- arch/x86/kernel/irq_64.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index acf8fbf..69bca46 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -38,6 +38,9 @@ static inline void stack_overflow_check(struct pt_regs *regs) #ifdef CONFIG_DEBUG_STACKOVERFLOW u64 curbase = (u64)task_stack_page(current); + if (user_mode_vm(regs)) + return; + WARN_ONCE(regs->sp >= curbase && regs->sp <= curbase + THREAD_SIZE && regs->sp < curbase + sizeof(struct thread_info) + -- cgit v1.1 From b495e039b4ce2ce4a96b3006004faf082f4d50e2 Mon Sep 17 00:00:00 2001 From: Jack Steiner Date: Tue, 29 Nov 2011 15:00:58 -0600 Subject: x86, UV: Fix UV2 hub part number There was a mixup when the SGI UV2 hub chip was sent to be fabricated, and it ended up with the wrong part number in the HRP_NODE_ID mmr. Future versions of the chip will (may) have the correct part number. Change the UV infrastructure to recognize both part numbers as valid IDs of a UV2 hub chip. Signed-off-by: Jack Steiner Link: http://lkml.kernel.org/r/20111129210058.GA20452@sgi.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_uv_x.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 62ae300..9d59bba 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -93,6 +93,8 @@ static int __init early_get_pnodeid(void) if (node_id.s.part_number == UV2_HUB_PART_NUMBER) uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; + if (node_id.s.part_number == UV2_HUB_PART_NUMBER_X) + uv_min_hub_revision_id += UV2_HUB_REVISION_BASE - 1; uv_hub_info->hub_revision = uv_min_hub_revision_id; pnode = (node_id.s.node_id >> 1) & ((1 << m_n_config.s.n_skt) - 1); -- cgit v1.1 From 4fc3490114bb159bd4fff1b3c96f4320fe6fb08f Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 7 Nov 2011 16:33:40 -0800 Subject: x86-64: Set siginfo and context on vsyscall emulation faults To make this work, we teach the page fault handler how to send signals on failed uaccess. This only works for user addresses (kernel addresses will never hit the page fault handler in the first place), so we need to generate signals for those separately. This gets the tricky case right: if the user buffer spans multiple pages and only the second page is invalid, we set cr2 and si_addr correctly. UML relies on this behavior to "fault in" pages as needed. We steal a bit from thread_info.uaccess_err to enable this. Before this change, uaccess_err was a 32-bit boolean value. This fixes issues with UML when vsyscall=emulate. Reported-by: Adrian Bunk Signed-off-by: Andy Lutomirski Cc: richard -rw- weinberger Cc: H. 
Peter Anvin Cc: Linus Torvalds Link: http://lkml.kernel.org/r/4c8f91de7ec5cd2ef0f59521a04e1015f11e42b4.1320712291.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 75 ++++++++++++++++++++++++++++++++++++++----- 1 file changed, 67 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index e4d4a22..8084bec 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -140,11 +140,40 @@ static int addr_to_vsyscall_nr(unsigned long addr) return nr; } +static bool write_ok_or_segv(unsigned long ptr, size_t size) +{ + /* + * XXX: if access_ok, get_user, and put_user handled + * sig_on_uaccess_error, this could go away. + */ + + if (!access_ok(VERIFY_WRITE, (void __user *)ptr, size)) { + siginfo_t info; + struct thread_struct *thread = ¤t->thread; + + thread->error_code = 6; /* user fault, no page, write */ + thread->cr2 = ptr; + thread->trap_no = 14; + + memset(&info, 0, sizeof(info)); + info.si_signo = SIGSEGV; + info.si_errno = 0; + info.si_code = SEGV_MAPERR; + info.si_addr = (void __user *)ptr; + + force_sig_info(SIGSEGV, &info, current); + return false; + } else { + return true; + } +} + bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) { struct task_struct *tsk; unsigned long caller; int vsyscall_nr; + int prev_sig_on_uaccess_error; long ret; /* @@ -180,35 +209,65 @@ bool emulate_vsyscall(struct pt_regs *regs, unsigned long address) if (seccomp_mode(&tsk->seccomp)) do_exit(SIGKILL); + /* + * With a real vsyscall, page faults cause SIGSEGV. We want to + * preserve that behavior to make writing exploits harder. + */ + prev_sig_on_uaccess_error = current_thread_info()->sig_on_uaccess_error; + current_thread_info()->sig_on_uaccess_error = 1; + + /* + * 0 is a valid user pointer (in the access_ok sense) on 32-bit and + * 64-bit, so we don't need to special-case it here. For all the + * vsyscalls, 0 means "don't write anything" not "write it at + * address 0". + */ + ret = -EFAULT; switch (vsyscall_nr) { case 0: + if (!write_ok_or_segv(regs->di, sizeof(struct timeval)) || + !write_ok_or_segv(regs->si, sizeof(struct timezone))) + break; + ret = sys_gettimeofday( (struct timeval __user *)regs->di, (struct timezone __user *)regs->si); break; case 1: + if (!write_ok_or_segv(regs->di, sizeof(time_t))) + break; + ret = sys_time((time_t __user *)regs->di); break; case 2: + if (!write_ok_or_segv(regs->di, sizeof(unsigned)) || + !write_ok_or_segv(regs->si, sizeof(unsigned))) + break; + ret = sys_getcpu((unsigned __user *)regs->di, (unsigned __user *)regs->si, 0); break; } + current_thread_info()->sig_on_uaccess_error = prev_sig_on_uaccess_error; + if (ret == -EFAULT) { - /* - * Bad news -- userspace fed a bad pointer to a vsyscall. - * - * With a real vsyscall, that would have caused SIGSEGV. - * To make writing reliable exploits using the emulated - * vsyscalls harder, generate SIGSEGV here as well. - */ + /* Bad news -- userspace fed a bad pointer to a vsyscall. */ warn_bad_vsyscall(KERN_INFO, regs, "vsyscall fault (exploit attempt?)"); - goto sigsegv; + + /* + * If we failed to generate a signal for any reason, + * generate one here. (This should be impossible.) + */ + if (WARN_ON_ONCE(!sigismember(&tsk->pending.signal, SIGBUS) && + !sigismember(&tsk->pending.signal, SIGSEGV))) + goto sigsegv; + + return true; /* Don't emulate the ret. 
*/ } regs->ax = ret; -- cgit v1.1 From 2e57ae0515124af45dd889bfbd4840fd40fcc07d Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 7 Nov 2011 16:33:41 -0800 Subject: x86: Default to vsyscall=emulate This essentially reverts: 2b666859ec32: x86: Default to vsyscall=native for now The ABI breakage should now be fixed by: commit 48c4206f5b02f28c4c78a1f5b491d3772fb64fb9 Author: Andy Lutomirski Date: Thu Oct 20 08:48:19 2011 -0700 x86-64: Set siginfo and context on vsyscall emulation faults Signed-off-by: Andy Lutomirski Cc: richard -rw- weinberger Cc: Adrian Bunk Cc: H. Peter Anvin Cc: Linus Torvalds Link: http://lkml.kernel.org/r/93154af3b2b6d208906ae02d80d92cf60c6fa94f.1320712291.git.luto@amacapital.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/vsyscall_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/vsyscall_64.c b/arch/x86/kernel/vsyscall_64.c index 8084bec..b07ba93 100644 --- a/arch/x86/kernel/vsyscall_64.c +++ b/arch/x86/kernel/vsyscall_64.c @@ -57,7 +57,7 @@ DEFINE_VVAR(struct vsyscall_gtod_data, vsyscall_gtod_data) = .lock = __SEQLOCK_UNLOCKED(__vsyscall_gtod_data.lock), }; -static enum { EMULATE, NATIVE, NONE } vsyscall_mode = NATIVE; +static enum { EMULATE, NATIVE, NONE } vsyscall_mode = EMULATE; static int __init vsyscall_setup(char *str) { -- cgit v1.1 From 6be30bb7d7504ec687a65c9bbdae8d1d2f8eaa19 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Wed, 16 Nov 2011 00:19:51 +0100 Subject: x86/reboot: Blacklist Dell OptiPlex 990 known to require PCI reboot Dell OptiPlex 990 is known to require PCI reboot, so add it to the reboot blacklist in pci_reboot_dmi_table[]. Signed-off-by: Rafael J. Wysocki Link: http://lkml.kernel.org/r/201111160019.51303.rjw@sisk.pl Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index e334be1..4463d2f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -443,6 +443,14 @@ static struct dmi_system_id __initdata pci_reboot_dmi_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6420"), }, }, + { /* Handle problems with rebooting on the OptiPlex 990. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + }, + }, { } }; -- cgit v1.1 From 9e6866686bdf2dcf3aeb0838076237ede532dcc8 Mon Sep 17 00:00:00 2001 From: Bjorn Helgaas Date: Sun, 25 Sep 2011 15:29:00 -0600 Subject: x86/mpparse: Account for bus types other than ISA and PCI In commit f8924e770e04 ("x86: unify mp_bus_info"), the 32-bit and 64-bit versions of MP_bus_info were rearranged to match each other better. Unfortunately it introduced a regression: prior to that change we used to always set the mp_bus_not_pci bit, then clear it if we found a PCI bus. After it, we set mp_bus_not_pci for ISA buses, clear it for PCI buses, and leave it alone otherwise. In the cases of ISA and PCI, there's not much difference. But ISA is not the only non-PCI bus, so it's better to always set mp_bus_not_pci and clear it only for PCI. Without this change, Dan's Dell PowerEdge 4200 panics on boot with a log indicating interrupt routing trouble unless the "noapic" option is supplied. With this change, the machine boots reliably without "noapic". 
Fixes http://bugs.debian.org/586494 Reported-bisected-and-tested-by: Dan McGrath Signed-off-by: Bjorn Helgaas Cc: stable@vger.kernel.org # 2.6.26+ Cc: Dan McGrath Cc: Alexey Starikovskiy [jrnieder@gmail.com: clarified commit message] Signed-off-by: Jonathan Nieder Link: http://lkml.kernel.org/r/20111122215000.GA9151@elie.hsd1.il.comcast.net Signed-off-by: Ingo Molnar --- arch/x86/kernel/mpparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/mpparse.c b/arch/x86/kernel/mpparse.c index 9103b89..0741b062 100644 --- a/arch/x86/kernel/mpparse.c +++ b/arch/x86/kernel/mpparse.c @@ -95,8 +95,8 @@ static void __init MP_bus_info(struct mpc_bus *m) } #endif + set_bit(m->busid, mp_bus_not_pci); if (strncmp(str, BUSTYPE_ISA, sizeof(BUSTYPE_ISA) - 1) == 0) { - set_bit(m->busid, mp_bus_not_pci); #if defined(CONFIG_EISA) || defined(CONFIG_MCA) mp_bus_id_to_type[m->busid] = MP_BUS_ISA; #endif -- cgit v1.1 From 644ddf588f5dba34df483a6ea8abe639cc102289 Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 18 Oct 2011 13:24:10 -0400 Subject: Add TAINT_FIRMWARE_WORKAROUND on MTRR fixup TAINT_FIRMWARE_WORKAROUND should be set when an MTRR fixup is done. Signed-off-by: Prarit Bhargava Acked-by: David Rientjes Link: http://lkml.kernel.org/r/1318958650-12447-1-git-send-email-prarit@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index a71efcdb..e1fe7f4 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -547,6 +547,7 @@ static void generic_get_mtrr(unsigned int reg, unsigned long *base, if (tmp != mask_lo) { printk(KERN_WARNING "mtrr: your BIOS has configured an incorrect mask, fixing it.\n"); + add_taint(TAINT_FIRMWARE_WORKAROUND); mask_lo = tmp; } } -- cgit v1.1 From 3f7787b36cf2d99f3dbc8a0be85b92a5530a9a76 Mon Sep 17 00:00:00 2001 From: Ferenc Wagner Date: Fri, 18 Nov 2011 15:28:22 +0100 Subject: x86: Replace the EVT_TO_HPET_DEV() macro with an inline function The original macro worked only when applied to variables named 'evt'. While this could have been fixed by simply renaming the macro argument, a more type-safe replacement is preferred. 
Signed-off-by: Ferenc Wagner Cc: Venkatesh Pallipadi \(Venki\) Link: http://lkml.kernel.org/r/8ed5c66c02041226e8cf8b4d5d6b41e543d90bd6.1321626272.git.wferi@niif.hu Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b946a9e..52aae9a 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -32,8 +32,6 @@ #define HPET_MIN_CYCLES 128 #define HPET_MIN_PROG_DELTA (HPET_MIN_CYCLES + (HPET_MIN_CYCLES >> 1)) -#define EVT_TO_HPET_DEV(evt) container_of(evt, struct hpet_dev, evt) - /* * HPET address is set in acpi/boot.c, when an ACPI entry exists */ @@ -55,6 +53,11 @@ struct hpet_dev { char name[10]; }; +inline struct hpet_dev *EVT_TO_HPET_DEV(struct clock_event_device *evtdev) +{ + return container_of(evtdev, struct hpet_dev, evt); +} + inline unsigned int hpet_readl(unsigned int a) { return readl(hpet_virt_address + a); -- cgit v1.1 From cced40229993bb63238299e48a22e4c8d1e13181 Mon Sep 17 00:00:00 2001 From: Thomas Meyer Date: Thu, 17 Nov 2011 23:43:40 +0100 Subject: x86: Use kmemdup() in copy_thread(), rather than duplicating its implementation The semantic patch that makes this change is available in scripts/coccinelle/api/memdup.cocci. Signed-off-by: Thomas Meyer Link: http://lkml.kernel.org/r/1321569820.1624.275.camel@localhost.localdomain Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3bd7e6e..d2c1f62 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -293,13 +293,12 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, memset(p->thread.ptrace_bps, 0, sizeof(p->thread.ptrace_bps)); if (unlikely(test_tsk_thread_flag(me, TIF_IO_BITMAP))) { - p->thread.io_bitmap_ptr = kmalloc(IO_BITMAP_BYTES, GFP_KERNEL); + p->thread.io_bitmap_ptr = kmemdup(me->thread.io_bitmap_ptr, + IO_BITMAP_BYTES, GFP_KERNEL); if (!p->thread.io_bitmap_ptr) { p->thread.io_bitmap_max = 0; return -ENOMEM; } - memcpy(p->thread.io_bitmap_ptr, me->thread.io_bitmap_ptr, - IO_BITMAP_BYTES); set_tsk_thread_flag(p, TIF_IO_BITMAP); } -- cgit v1.1 From bd399063976c6c7a09beb4730ed1d93cadbcc739 Mon Sep 17 00:00:00 2001 From: "Srivatsa S. Bhat" Date: Mon, 7 Nov 2011 18:05:32 +0530 Subject: x86, microcode: Fix the failure path of microcode update driver init code The microcode update driver's initialization code does not handle failures correctly. This patch fixes this issue. Signed-off-by: Jan Beulich Signed-off-by: Srivatsa S. 
Bhat Link: http://lkml.kernel.org/r/20111107123530.12164.31227.stgit@srivatsabhat.in.ibm.com Link: http://lkml.kernel.org/r/4ED8E2270200007800065120@nat28.tlf.novell.com Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_core.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index f2d2a66..9d46f5e 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -256,7 +256,7 @@ static int __init microcode_dev_init(void) return 0; } -static void microcode_dev_exit(void) +static void __exit microcode_dev_exit(void) { misc_deregister(µcode_dev); } @@ -519,10 +519,8 @@ static int __init microcode_init(void) microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); - if (IS_ERR(microcode_pdev)) { - microcode_dev_exit(); + if (IS_ERR(microcode_pdev)) return PTR_ERR(microcode_pdev); - } get_online_cpus(); mutex_lock(µcode_mutex); @@ -532,14 +530,12 @@ static int __init microcode_init(void) mutex_unlock(µcode_mutex); put_online_cpus(); - if (error) { - platform_device_unregister(microcode_pdev); - return error; - } + if (error) + goto out_pdev; error = microcode_dev_init(); if (error) - return error; + goto out_sysdev_driver; register_syscore_ops(&mc_syscore_ops); register_hotcpu_notifier(&mc_cpu_notifier); @@ -548,6 +544,20 @@ static int __init microcode_init(void) " , Peter Oruba\n"); return 0; + +out_sysdev_driver: + get_online_cpus(); + mutex_lock(µcode_mutex); + + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); + put_online_cpus(); + +out_pdev: + platform_device_unregister(microcode_pdev); + return error; + } module_init(microcode_init); -- cgit v1.1 From 8dbf4a30033ff61091015f0076e872b5c8f717cc Mon Sep 17 00:00:00 2001 From: Ajaykumar Hotchandani Date: Fri, 11 Nov 2011 18:31:57 +0530 Subject: x86/mtrr: Resolve inconsistency with Intel processor manual Following is from Notes of section 11.5.3 of Intel processor manual available at: http://www.intel.com/Assets/PDF/manual/325384.pdf For the Pentium 4 and Intel Xeon processors, after the sequence of steps given above has been executed, the cache lines containing the code between the end of the WBINVD instruction and before the MTRRS have actually been disabled may be retained in the cache hierarchy. Here, to remove code from the cache completely, a second WBINVD instruction must be executed after the MTRRs have been disabled. This patch provides resolution for that. Ideally, I will like to make changes only for Pentium 4 and Xeon processors. But, I am not finding easier way to do it. And, extra wbinvd() instruction does not hurt much for other processors. 
Signed-off-by: Ajaykumar Hotchandani Cc: Linus Torvalds Cc: Arjan van de Ven Cc: Lucas De Marchi Link: http://lkml.kernel.org/r/4EBD1CC5.3030008@oracle.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mtrr/generic.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index e1fe7f4..97b2635 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -694,6 +694,7 @@ static void prepare_set(void) __acquires(set_atomicity_lock) /* Disable MTRRs, and set the default type to uncached */ mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); + wbinvd(); } static void post_set(void) __releases(set_atomicity_lock) -- cgit v1.1 From 1ef03890969932e9359b9a4c658f7f87771910ac Mon Sep 17 00:00:00 2001 From: Peter Chubb Date: Mon, 5 Dec 2011 16:53:53 +0300 Subject: x86: Fix "Acer Aspire 1" reboot hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Looks like on some Acer Aspire 1s with older bioses, reboot via bios fails. It works on my machine, (with BIOS version 0.3310) but not on some others (BIOS version 0.3309). There's a log of problems at: https://bbs.archlinux.org/viewtopic.php?id=124136 This patch adds a different callback to the reboot quirk table, to allow rebooting via keyboard controller. Reported-by: Uroš Vampl Tested-by: Vasily Khoruzhick Signed-off-by: Peter Chubb Cc: Don Zickus Cc: Peter Zijlstra Cc: stable@kernel.org Link: http://lkml.kernel.org/r/1323093233-9481-1-git-send-email-anarsoul@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/reboot.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 4463d2f..37a458b 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -124,7 +124,7 @@ __setup("reboot=", reboot_setup); */ /* - * Some machines require the "reboot=b" commandline option, + * Some machines require the "reboot=b" or "reboot=k" commandline options, * this quirk makes that automatic. */ static int __init set_bios_reboot(const struct dmi_system_id *d) @@ -136,6 +136,15 @@ static int __init set_bios_reboot(const struct dmi_system_id *d) return 0; } +static int __init set_kbd_reboot(const struct dmi_system_id *d) +{ + if (reboot_type != BOOT_KBD) { + reboot_type = BOOT_KBD; + printk(KERN_INFO "%s series board detected. Selecting KBD-method for reboot.\n", d->ident); + } + return 0; +} + static struct dmi_system_id __initdata reboot_dmi_table[] = { { /* Handle problems with rebooting on Dell E520's */ .callback = set_bios_reboot, @@ -295,7 +304,7 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, { /* Handle reboot issue on Acer Aspire one */ - .callback = set_bios_reboot, + .callback = set_kbd_reboot, .ident = "Acer Aspire One A110", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Acer"), -- cgit v1.1 From 98b8b99ae1233a5408b136544e900a9cfb46e08f Mon Sep 17 00:00:00 2001 From: H Hartley Sweeten Date: Tue, 15 Nov 2011 14:48:58 -0800 Subject: arch/x86/kernel/ptrace.c: Quiet sparse noise ptrace_set_debugreg() is only used in this file and should be static. This also quiets the following sparse warning: warning: symbol 'ptrace_set_debugreg' was not declared. Should it be static?
Signed-off-by: H Hartley Sweeten Signed-off-by: Andrew Morton Cc: hartleys@visionengravers.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 8252879..89a04c7 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -749,7 +749,8 @@ put: /* * Handle PTRACE_POKEUSR calls for the debug register area. */ -int ptrace_set_debugreg(struct task_struct *tsk, int n, unsigned long val) +static int ptrace_set_debugreg(struct task_struct *tsk, int n, + unsigned long val) { struct thread_struct *thread = &(tsk->thread); int rc = 0; -- cgit v1.1 From 35d476996288af6a4aaa8b172bcd31decd233de7 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Tue, 15 Nov 2011 14:46:52 -0800 Subject: x86/rtc, mrst: Don't register a platform RTC device for Intel MID platforms Intel MID x86 platforms have a memory mapped virtual RTC instead. No MID platform has the default ports (and accessing them may do weird stuff). Signed-off-by: Mathias Nyman Signed-off-by: Alan Cox Cc: feng.tang@intel.com Cc: Feng Tang Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- arch/x86/kernel/rtc.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c index 348ce01..af6db6e 100644 --- a/arch/x86/kernel/rtc.c +++ b/arch/x86/kernel/rtc.c @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_X86_32 /* @@ -242,6 +243,10 @@ static __init int add_rtc_cmos(void) if (of_have_populated_dt()) return 0; + /* Intel MID platforms don't have ioport rtc */ + if (mrst_identify_cpu()) + return -ENODEV; + platform_device_register(&rtc_device); dev_info(&rtc_device.dev, "registered platform RTC device (no PNP device found)\n"); -- cgit v1.1 From 9a0ebfbe3f1007008d198ccc6b86783cdb312fec Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 5 Dec 2011 16:20:36 +0800 Subject: x86: Make flat_init_apic_ldr() available Allow flat_init_apic_ldr() to be used outside the compilation unit for similar APIC implementations. Signed-off-by: Daniel J Blueman Cc: Steffen Persvold Cc: Jesse Barnes Link: http://lkml.kernel.org/r/1323073238-32686-1-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic_flat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index f7a41e4..57c1f41 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -62,7 +62,7 @@ static void flat_vector_allocation_domain(int cpu, struct cpumask *retmask) * an APIC. See e.g. "AP-388 82489DX User's Manual" (Intel * document number 292116). So here it goes... */ -static void flat_init_apic_ldr(void) +void flat_init_apic_ldr(void) { unsigned long val; unsigned long num, id; -- cgit v1.1 From 64be4c1c2428e148de6081af235e2418e6a66dda Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 5 Dec 2011 16:20:37 +0800 Subject: x86: Add x86_init platform override to fix up NUMA core numbering Add an x86_init vector for handling inconsistent core numbering. This is useful for multi-fabric platforms, such as Numascale NumaConnect.
v2: - use struct x86_cpuinit_ops - provide default fall-back function to warn Signed-off-by: Daniel J Blueman Cc: Steffen Persvold Cc: Jesse Barnes Link: http://lkml.kernel.org/r/1323073238-32686-2-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 7 +++++++ arch/x86/kernel/cpu/common.c | 9 +++++++++ arch/x86/kernel/x86_init.c | 1 + 3 files changed, 17 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 0bab2b1..ef21bdc 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -353,6 +353,13 @@ static void __cpuinit srat_detect_node(struct cpuinfo_x86 *c) if (node == NUMA_NO_NODE) node = per_cpu(cpu_llc_id, cpu); + /* + * If core numbers are inconsistent, it's likely a multi-fabric platform, + * so invoke platform-specific handler + */ + if (c->phys_proc_id != node) + x86_cpuinit.fixup_cpu_id(c, node); + if (!node_online(node)) { /* * Two possibilities here: diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b1..ad4da45 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1141,6 +1141,15 @@ static void dbg_restore_debug_regs(void) #endif /* ! CONFIG_KGDB */ /* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} + +/* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT * and IDT. We reload them nevertheless, this function acts as a diff --git a/arch/x86/kernel/x86_init.c b/arch/x86/kernel/x86_init.c index c1d6cd5..91f83e2 100644 --- a/arch/x86/kernel/x86_init.c +++ b/arch/x86/kernel/x86_init.c @@ -92,6 +92,7 @@ struct x86_init_ops x86_init __initdata = { struct x86_cpuinit_ops x86_cpuinit __cpuinitdata = { .setup_percpu_clockev = setup_secondary_APIC_clock, + .fixup_cpu_id = x86_default_fixup_cpu_id, }; static void default_nmi_init(void) { }; -- cgit v1.1 From 44b111b519160e33fdc41eadb39af86a24707edf Mon Sep 17 00:00:00 2001 From: Steffen Persvold Date: Tue, 6 Dec 2011 00:07:26 +0800 Subject: x86: Add NumaChip support Adds support for Numascale NumaChip large-SMP systems. It is needed to enable the booting of more than ~168 cores. v2: - [Steffen] enumerate only accessible northbridges - [Daniel] rediffed and validated against 3.1-rc10 v3: - [Daniel] use x86_init core numbering override - [Daniel] cleanups as per feedback v4: - [Daniel] use updated x86_cpuinit override v5: - drop disabling interrupts locally, as ISR write is atomic; drop delay - added read-mostly annotations where appropriate - require CONFIG_SMP, so drop conditional path Workload tested on 96 cores/16 sockets. 
Signed-off-by: Steffen Persvold Signed-off-by: Daniel J Blueman Cc: Jesse Barnes Link: http://lkml.kernel.org/r/1323101246-2400-1-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/Makefile | 1 + arch/x86/kernel/apic/apic_numachip.c | 294 +++++++++++++++++++++++++++++++++++ 2 files changed, 295 insertions(+) create mode 100644 arch/x86/kernel/apic/apic_numachip.c (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/Makefile b/arch/x86/kernel/apic/Makefile index 767fd04..0ae0323 100644 --- a/arch/x86/kernel/apic/Makefile +++ b/arch/x86/kernel/apic/Makefile @@ -10,6 +10,7 @@ obj-$(CONFIG_SMP) += ipi.o ifeq ($(CONFIG_X86_64),y) # APIC probe will depend on the listing order here +obj-$(CONFIG_X86_NUMACHIP) += apic_numachip.o obj-$(CONFIG_X86_UV) += x2apic_uv_x.o obj-$(CONFIG_X86_X2APIC) += x2apic_phys.o obj-$(CONFIG_X86_X2APIC) += x2apic_cluster.o diff --git a/arch/x86/kernel/apic/apic_numachip.c b/arch/x86/kernel/apic/apic_numachip.c new file mode 100644 index 0000000..09d3d8c --- /dev/null +++ b/arch/x86/kernel/apic/apic_numachip.c @@ -0,0 +1,294 @@ +/* + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + * + * Numascale NumaConnect-Specific APIC Code + * + * Copyright (C) 2011 Numascale AS. All rights reserved. + * + * Send feedback to + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static int numachip_system __read_mostly; + +static struct apic apic_numachip __read_mostly; + +static unsigned int get_apic_id(unsigned long x) +{ + unsigned long value; + unsigned int id; + + rdmsrl(MSR_FAM10H_NODE_ID, value); + id = ((x >> 24) & 0xffU) | ((value << 2) & 0x3f00U); + + return id; +} + +static unsigned long set_apic_id(unsigned int id) +{ + unsigned long x; + + x = ((id & 0xffU) << 24); + return x; +} + +static unsigned int read_xapic_id(void) +{ + return get_apic_id(apic_read(APIC_ID)); +} + +static int numachip_apic_id_registered(void) +{ + return physid_isset(read_xapic_id(), phys_cpu_present_map); +} + +static int numachip_phys_pkg_id(int initial_apic_id, int index_msb) +{ + return initial_apic_id >> index_msb; +} + +static const struct cpumask *numachip_target_cpus(void) +{ + return cpu_online_mask; +} + +static void numachip_vector_allocation_domain(int cpu, struct cpumask *retmask) +{ + cpumask_clear(retmask); + cpumask_set_cpu(cpu, retmask); +} + +static int __cpuinit numachip_wakeup_secondary(int phys_apicid, unsigned long start_rip) +{ + union numachip_csr_g3_ext_irq_gen int_gen; + + int_gen.s._destination_apic_id = phys_apicid; + int_gen.s._vector = 0; + int_gen.s._msgtype = APIC_DM_INIT >> 8; + int_gen.s._index = 0; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + + int_gen.s._msgtype = APIC_DM_STARTUP >> 8; + int_gen.s._vector = start_rip >> 12; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); + + atomic_set(&init_deasserted, 1); + return 0; +} + +static void numachip_send_IPI_one(int cpu, int vector) +{ + union numachip_csr_g3_ext_irq_gen int_gen; + int apicid = per_cpu(x86_cpu_to_apicid, cpu); + + int_gen.s._destination_apic_id = apicid; + int_gen.s._vector = vector; + int_gen.s._msgtype = (vector == NMI_VECTOR ? 
APIC_DM_NMI : APIC_DM_FIXED) >> 8; + int_gen.s._index = 0; + + write_lcsr(CSR_G3_EXT_IRQ_GEN, int_gen.v); +} + +static void numachip_send_IPI_mask(const struct cpumask *mask, int vector) +{ + unsigned int cpu; + + for_each_cpu(cpu, mask) + numachip_send_IPI_one(cpu, vector); +} + +static void numachip_send_IPI_mask_allbutself(const struct cpumask *mask, + int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_cpu(cpu, mask) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_allbutself(int vector) +{ + unsigned int this_cpu = smp_processor_id(); + unsigned int cpu; + + for_each_online_cpu(cpu) { + if (cpu != this_cpu) + numachip_send_IPI_one(cpu, vector); + } +} + +static void numachip_send_IPI_all(int vector) +{ + numachip_send_IPI_mask(cpu_online_mask, vector); +} + +static void numachip_send_IPI_self(int vector) +{ + __default_send_IPI_shortcut(APIC_DEST_SELF, vector, APIC_DEST_PHYSICAL); +} + +static unsigned int numachip_cpu_mask_to_apicid(const struct cpumask *cpumask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + cpu = cpumask_first(cpumask); + if (likely((unsigned)cpu < nr_cpu_ids)) + return per_cpu(x86_cpu_to_apicid, cpu); + + return BAD_APICID; +} + +static unsigned int +numachip_cpu_mask_to_apicid_and(const struct cpumask *cpumask, + const struct cpumask *andmask) +{ + int cpu; + + /* + * We're using fixed IRQ delivery, can only return one phys APIC ID. + * May as well be the first. + */ + for_each_cpu_and(cpu, cpumask, andmask) { + if (cpumask_test_cpu(cpu, cpu_online_mask)) + break; + } + return per_cpu(x86_cpu_to_apicid, cpu); +} + +static int __init numachip_probe(void) +{ + return apic == &apic_numachip; +} + +static void __init map_csrs(void) +{ + printk(KERN_INFO "NumaChip: Mapping local CSR space (%016llx - %016llx)\n", + NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_BASE + NUMACHIP_LCSR_SIZE - 1); + init_extra_mapping_uc(NUMACHIP_LCSR_BASE, NUMACHIP_LCSR_SIZE); + + printk(KERN_INFO "NumaChip: Mapping global CSR space (%016llx - %016llx)\n", + NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_BASE + NUMACHIP_GCSR_SIZE - 1); + init_extra_mapping_uc(NUMACHIP_GCSR_BASE, NUMACHIP_GCSR_SIZE); +} + +static void fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + c->phys_proc_id = node; + per_cpu(cpu_llc_id, smp_processor_id()) = node; +} + +static int __init numachip_system_init(void) +{ + unsigned int val; + + if (!numachip_system) + return 0; + + x86_cpuinit.fixup_cpu_id = fixup_cpu_id; + + map_csrs(); + + val = read_lcsr(CSR_G0_NODE_IDS); + printk(KERN_INFO "NumaChip: Local NodeID = %08x\n", val); + + return 0; +} +early_initcall(numachip_system_init); + +static int numachip_acpi_madt_oem_check(char *oem_id, char *oem_table_id) +{ + if (!strncmp(oem_id, "NUMASC", 6)) { + numachip_system = 1; + return 1; + } + + return 0; +} + +static struct apic apic_numachip __refconst = { + + .name = "NumaConnect system", + .probe = numachip_probe, + .acpi_madt_oem_check = numachip_acpi_madt_oem_check, + .apic_id_registered = numachip_apic_id_registered, + + .irq_delivery_mode = dest_Fixed, + .irq_dest_mode = 0, /* physical */ + + .target_cpus = numachip_target_cpus, + .disable_esr = 0, + .dest_logical = 0, + .check_apicid_used = NULL, + .check_apicid_present = NULL, + + .vector_allocation_domain = numachip_vector_allocation_domain, + .init_apic_ldr = flat_init_apic_ldr, + + .ioapic_phys_id_map = NULL, + .setup_apic_routing = NULL, + 
.multi_timer_check = NULL, + .cpu_present_to_apicid = default_cpu_present_to_apicid, + .apicid_to_cpu_present = NULL, + .setup_portio_remap = NULL, + .check_phys_apicid_present = default_check_phys_apicid_present, + .enable_apic_mode = NULL, + .phys_pkg_id = numachip_phys_pkg_id, + .mps_oem_check = NULL, + + .get_apic_id = get_apic_id, + .set_apic_id = set_apic_id, + .apic_id_mask = 0xffU << 24, + + .cpu_mask_to_apicid = numachip_cpu_mask_to_apicid, + .cpu_mask_to_apicid_and = numachip_cpu_mask_to_apicid_and, + + .send_IPI_mask = numachip_send_IPI_mask, + .send_IPI_mask_allbutself = numachip_send_IPI_mask_allbutself, + .send_IPI_allbutself = numachip_send_IPI_allbutself, + .send_IPI_all = numachip_send_IPI_all, + .send_IPI_self = numachip_send_IPI_self, + + .wakeup_secondary_cpu = numachip_wakeup_secondary, + .trampoline_phys_low = DEFAULT_TRAMPOLINE_PHYS_LOW, + .trampoline_phys_high = DEFAULT_TRAMPOLINE_PHYS_HIGH, + .wait_for_init_deassert = NULL, + .smp_callin_clear_local_apic = NULL, + .inquire_remote_apic = NULL, /* REMRD not supported */ + + .read = native_apic_mem_read, + .write = native_apic_mem_write, + .icr_read = native_apic_icr_read, + .icr_write = native_apic_icr_write, + .wait_icr_idle = native_apic_wait_icr_idle, + .safe_wait_icr_idle = native_safe_apic_wait_icr_idle, +}; +apic_driver(apic_numachip); + -- cgit v1.1 From 70ea6855d368588a7f1b0242ab83ca6fe2e2ff16 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 29 Nov 2011 10:54:22 +0000 Subject: x86-64: Slightly shorten int_ret_from_sys_call Testing for a return to ring 0 was necessary here solely because of the branch out of ret_from_fork. That branch, however, can be directed to retint_restore_args, and thus the test-and-branch can be eliminated here. Signed-off-by: Jan Beulich Reviewed-by: Andi Kleen Link: http://lkml.kernel.org/r/4ED4C7EE0200007800064028@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index faf8d5e..ab4b7ff 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -411,7 +411,7 @@ ENTRY(ret_from_fork) RESTORE_REST testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? - je int_ret_from_sys_call + jz retint_restore_args testl $_TIF_IA32, TI_flags(%rcx) # 32-bit compat task needs IRET jnz int_ret_from_sys_call @@ -612,8 +612,6 @@ tracesys: GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $3,CS-ARGOFFSET(%rsp) - je retint_restore_args movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ GLOBAL(int_with_check) -- cgit v1.1 From 39e9543344fa3179e346d2b381c6e0cd17b516de Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 29 Nov 2011 11:03:46 +0000 Subject: x86-64: Reduce amount of redundant code generated for invalidate_interruptNN Previously these up to 32 entry points, consisting of all the same code except for their very first instruction, consumed 0x70 bytes per instance. Just like for device interrupt entry points, fold them together so that they all use a single instance of the code after having pushed their vector indicator (resulting in 0x10 bytes per instance, to retain 16-byte alignment of the individual entry points). 
Signed-off-by: Jan Beulich Reviewed-by: Andi Kleen Link: http://lkml.kernel.org/r/4ED4CA230200007800064065@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index ab4b7ff..1581f19 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -951,6 +951,7 @@ END(common_interrupt) ENTRY(\sym) INTR_FRAME pushq_cfi $~(\num) +.Lcommon_\sym: interrupt \do_sym jmp ret_from_intr CFI_ENDPROC @@ -974,13 +975,21 @@ apicinterrupt X86_PLATFORM_IPI_VECTOR \ x86_platform_ipi smp_x86_platform_ipi #ifdef CONFIG_SMP -.irp idx,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ + ALIGN + INTR_FRAME +.irp idx,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 .if NUM_INVALIDATE_TLB_VECTORS > \idx -apicinterrupt (INVALIDATE_TLB_VECTOR_START)+\idx \ - invalidate_interrupt\idx smp_invalidate_interrupt +ENTRY(invalidate_interrupt\idx) + pushq_cfi $~(INVALIDATE_TLB_VECTOR_START+\idx) + jmp .Lcommon_invalidate_interrupt0 + CFI_ADJUST_CFA_OFFSET -8 +END(invalidate_interrupt\idx) .endif .endr + CFI_ENDPROC +apicinterrupt INVALIDATE_TLB_VECTOR_START, \ + invalidate_interrupt0, smp_invalidate_interrupt #endif apicinterrupt THRESHOLD_APIC_VECTOR \ -- cgit v1.1 From 46db09d3fd847f185a7d23a96bc8fe7a4be0cd05 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 29 Nov 2011 11:17:45 +0000 Subject: x86-64: Slightly shorten line system call entry and exit paths GET_THREAD_INFO() involves a memory read immediately followed by an "sub" on the value read, in turn (in several cases) immediately followed by a use of the calculated value as the base address of a memory access. This combination of instructions has a non-negligible potential for stalls. In the system call entry point code, however, the (fixed) offset of the stack pointer from the end of the stack is generally known, and hence we can instead avoid the memory load and subtract, and instead do the memory reference using %rsp as the base register. To do so in a legible fashion, introduce a THREAD_INFO() macro which, provided a register (generally %rsp) and the known offset from the end of the stack, produces a suitable memory access operand. The patch attempts to only touch the fast paths (no auditing and alike), but manages to do so only in the 64-bit entry point case; the compatibility mode entry points have so many interdependencies between their various branch targets that it was necessary to also adjust the slow paths to eliminate the risk of having missed some register dependency during code analysis. 
Signed-off-by: Jan Beulich Reviewed-by: Andi Kleen Link: http://lkml.kernel.org/r/4ED4CD690200007800064075@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1581f19..75f72a5 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -478,8 +478,7 @@ ENTRY(system_call_after_swapgs) movq %rax,ORIG_RAX-ARGOFFSET(%rsp) movq %rcx,RIP-ARGOFFSET(%rsp) CFI_REL_OFFSET rip,RIP-ARGOFFSET - GET_THREAD_INFO(%rcx) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags(%rcx) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jnz tracesys system_call_fastpath: cmpq $__NR_syscall_max,%rax @@ -496,10 +495,9 @@ ret_from_sys_call: /* edi: flagmask */ sysret_check: LOCKDEP_SYS_EXIT - GET_THREAD_INFO(%rcx) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - movl TI_flags(%rcx),%edx + movl TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET),%edx andl %edi,%edx jnz sysret_careful CFI_REMEMBER_STATE @@ -583,7 +581,7 @@ sysret_audit: /* Do syscall tracing */ tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags(%rcx) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) jz auditsys #endif SAVE_REST -- cgit v1.1 From f6b2bc847641ea38e2655c8424fef5d2d19f35f9 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Tue, 29 Nov 2011 11:24:10 +0000 Subject: x86-64: Cleanup some assembly entry points system_call_after_swapgs doesn't really benefit from forcing alignment from it - quite the opposite, native code needlessly so far got a big NOP instruction inserted in front of it. Xen being the only user of the separate entry point can well live with the branch going to three bytes into a cache line. The compatibility mode ptregs entry points for one can make use of the GLOBAL() macro, and should be suitably aligned. Their shared continuation point (ia32_ptregs_common) otoh doesn't need to be global at all, but should continue to be properly aligned. Signed-off-by: Jan Beulich Reviewed-by: Andi Kleen Link: http://lkml.kernel.org/r/4ED4CEEA020000780006407D@nat28.tlf.novell.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 75f72a5..cfad7fce 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -465,7 +465,7 @@ ENTRY(system_call) * after the swapgs, so that it can do the swapgs * for the guest and jump here on syscall. */ -ENTRY(system_call_after_swapgs) +GLOBAL(system_call_after_swapgs) movq %rsp,PER_CPU_VAR(old_rsp) movq PER_CPU_VAR(kernel_stack),%rsp -- cgit v1.1 From 28a00184be261e3dc152ba0d664a067bbe235b6a Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Fri, 4 Nov 2011 15:42:17 -0700 Subject: x86, tsc: Skip TSC synchronization checks for tsc=reliable tsc=reliable boot parameter is supposed to skip all the TSC stability checks during boot time. On an 8-socket system where we want to run an experiment with the "tsc=reliable" boot option, TSC synchronization checks are not getting skipped and marking the TSC as not stable. Check for tsc_clocksource_reliable (which is set via tsc=reliable or for platforms supporting synthetic TSC_RELIABLE feature bit etc) and when set, skip the TSC synchronization tests during boot.
Signed-off-by: Suresh Siddha Acked-by: John Stultz Tested-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/1320446537.15071.14.camel@sbsiddha-desk.sc.intel.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 2 +- arch/x86/kernel/tsc_sync.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index db48336..eee4651 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -35,7 +35,7 @@ static int __read_mostly tsc_unstable; erroneous rdtsc usage on !cpu_has_tsc processors */ static int __read_mostly tsc_disabled = -1; -static int tsc_clocksource_reliable; +int tsc_clocksource_reliable; /* * Scheduler clock - returns current time in nanosec units. */ diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index 0aa5fed8..9eba29b 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -113,7 +113,7 @@ void __cpuinit check_tsc_sync_source(int cpu) if (unsynchronized_tsc()) return; - if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { + if (tsc_clocksource_reliable) { if (cpu == (nr_cpu_ids-1) || system_state != SYSTEM_BOOTING) pr_info( "Skipped synchronization checks as TSC is reliable.\n"); @@ -172,7 +172,7 @@ void __cpuinit check_tsc_sync_target(void) { int cpus = 2; - if (unsynchronized_tsc() || boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) + if (unsynchronized_tsc() || tsc_clocksource_reliable) return; /* -- cgit v1.1 From f62ef5f3e9cff065aa845e2b7f487e1810b8e57e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 2 Dec 2011 08:21:43 +0100 Subject: x86, amd: Fix up numa_node information for AMD CPU family 15h model 0-0fh northbridge functions I've received complaints that the numa_node attribute for family 15h model 00-0fh (e.g. Interlagos) northbridge functions shows -1 instead of the proper node ID. Correct this with attached quirks (similar to quirks for other AMD CPU families used in multi-socket systems). Signed-off-by: Andreas Herrmann Cc: Frank Arnold Cc: Borislav Petkov Link: http://lkml.kernel.org/r/20111202072143.GA31916@alberich.amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index b78643d..03920a1 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -553,4 +553,17 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, quirk_amd_nb_node); DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F0, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F1, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F2, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F3, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F4, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_15H_NB_F5, + quirk_amd_nb_node); + #endif -- cgit v1.1 From e4a02b4a951a7adf9d982b11c64686570c29fbe7 Mon Sep 17 00:00:00 2001 From: Steffen Persvold Date: Tue, 6 Dec 2011 01:10:31 +0100 Subject: x86: Fix the !CONFIG_NUMA build of the new CPU ID fixup code support I used "ifdef CONFIG_NUMA" simply because it doesn't make sense in a non-numa configuration even with SMP enabled. 
Besides, the only place where it is called right now is in kernel/cpu/amd.c:srat_detect_node() within the "CONFIG_NUMA" protected part. Signed-off-by: Steffen Persvold Cc: Daniel J Blueman Cc: Jesse Barnes Link: http://lkml.kernel.org/r/1323073238-32686-2-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ad4da45..a70bd5b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1146,7 +1146,9 @@ static void dbg_restore_debug_regs(void) */ void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) { +#ifdef CONFIG_NUMA pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +#endif } /* -- cgit v1.1 From 3596ff4e6b2aff8a28c69af389d5046090a53330 Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 25 Oct 2011 19:48:12 +0530 Subject: x86: Call do_notify_resume() with interrupts enabled do_notify_resume() gets called with interrupts disabled on x86_32. This is different from the x86_64 behavior, where interrupts are enabled at the time. Queries on lkml on this issue hasn't yielded any clear answer. Lets make x86_32 behave the same as x86_64, unless there is a real reason to maintain status quo. Please refer https://lkml.org/lkml/2011/9/27/130 for more details. A similar change was suggested in ARM: https://lkml.org/lkml/2011/8/25/231 My 32-bit machine works fine (tm) with this patch. Signed-off-by: Srikar Dronamraju Acked-by: Masami Hiramatsu Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20111025141812.GA21225@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_32.S | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f3f6f53..22d0e21 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -625,6 +625,8 @@ work_notifysig: # deal with pending signals and movl %esp, %eax jne work_notifysig_v86 # returning to kernel-space or # vm86-space + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig @@ -638,6 +640,8 @@ work_notifysig_v86: #else movl %esp, %eax #endif + TRACE_IRQS_ON + ENABLE_INTERRUPTS(CLBR_NONE) xorl %edx, %edx call do_notify_resume jmp resume_userspace_sig -- cgit v1.1 From cc3a1bf52a9d2808c7cd6e8f413b02b650b6b84b Mon Sep 17 00:00:00 2001 From: Srikar Dronamraju Date: Tue, 25 Oct 2011 19:51:59 +0530 Subject: x86: Clean up and extend do_int3() Since there is a possibility of !KPROBES int3 listeners (such as kgdb) and since DIE_TRAP is currently not being used by anybody, notify all listeners with DIE_INT3. 
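Listeners outside kprobes (kgdb-style debuggers, for instance) can then hook int3 through the ordinary die-notifier chain. A minimal sketch of such a listener, assuming a hypothetical consumer that planted one breakpoint of its own (illustrative only, not part of this patch):

    #include <linux/kdebug.h>
    #include <linux/notifier.h>
    #include <linux/ptrace.h>

    static unsigned long example_bp_addr;	/* assumed: address where we planted an int3 */

    static int example_int3_notify(struct notifier_block *nb, unsigned long val,
    			       void *data)
    {
    	struct die_args *args = data;

    	if (val != DIE_INT3)
    		return NOTIFY_DONE;

    	/* the saved ip points just past the 1-byte int3 instruction */
    	if (args->regs->ip - 1 == example_bp_addr)
    		return NOTIFY_STOP;	/* claim it; do_int3() then skips the SIGTRAP path */

    	return NOTIFY_DONE;
    }

    static struct notifier_block example_int3_nb = {
    	.notifier_call	= example_int3_notify,
    };

    /* registered once with: register_die_notifier(&example_int3_nb); */
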
Signed-off-by: Ananth N Mavinakayanahalli Signed-off-by: Srikar Dronamraju Signed-off-by: Peter Zijlstra Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20111025142159.GB21225@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/traps.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb8..fa1191fb 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -306,15 +306,10 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) == NOTIFY_STOP) return; #endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */ -#ifdef CONFIG_KPROBES + if (notify_die(DIE_INT3, "int3", regs, error_code, 3, SIGTRAP) == NOTIFY_STOP) return; -#else - if (notify_die(DIE_TRAP, "int3", regs, error_code, 3, SIGTRAP) - == NOTIFY_STOP) - return; -#endif preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); -- cgit v1.1 From 1e2ad28f80b4e155678259238f51edebc19e4014 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 18 Nov 2011 12:35:21 +0100 Subject: perf, x86: Implement event scheduler helper functions This patch introduces x86 perf scheduler code helper functions. We need this to later add more complex functionality to support overlapping counter constraints (next patch). The algorithm is modified so that the range of weight values is now generated from the constraints. There shouldn't be other functional changes. With the helper functions the scheduler is controlled. There are functions to initialize, traverse the event list, find unused counters etc. The scheduler keeps its own state. V3: * Added macro for_each_set_bit_cont(). * Changed functions interfaces of perf_sched_find_counter() and perf_sched_next_event() to use bool as return value. * Added some comments to make code better understandable. V4: * Fix broken event assignment if weight of the first event is not wmin (perf_sched_init()). Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1321616122-1533-2-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 185 ++++++++++++++++++++++++++++----------- 1 file changed, 132 insertions(+), 53 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 2bda212..5a469d3 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -484,18 +484,145 @@ static inline int is_x86_event(struct perf_event *event) return event->pmu == &pmu; } +/* + * Event scheduler state: + * + * Assign events iterating over all events and counters, beginning + * with events with least weights first. Keep the current iterator + * state in struct sched_state. + */ +struct sched_state { + int weight; + int event; /* event index */ + int counter; /* counter index */ + int unassigned; /* number of events to be assigned left */ + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; +}; + +struct perf_sched { + int max_weight; + int max_events; + struct event_constraint **constraints; + struct sched_state state; +}; + +/* + * Initialize interator that runs through all events and counters. 
+ */ +static void perf_sched_init(struct perf_sched *sched, struct event_constraint **c, + int num, int wmin, int wmax) +{ + int idx; + + memset(sched, 0, sizeof(*sched)); + sched->max_events = num; + sched->max_weight = wmax; + sched->constraints = c; + + for (idx = 0; idx < num; idx++) { + if (c[idx]->weight == wmin) + break; + } + + sched->state.event = idx; /* start with min weight */ + sched->state.weight = wmin; + sched->state.unassigned = num; +} + +/* + * Select a counter for the current event to schedule. Return true on + * success. + */ +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + struct event_constraint *c; + int idx; + + if (!sched->state.unassigned) + return false; + + if (sched->state.event >= sched->max_events) + return false; + + c = sched->constraints[sched->state.event]; + + /* Grab the first unused counter starting with idx */ + idx = sched->state.counter; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + break; + } + sched->state.counter = idx; + + if (idx >= X86_PMC_IDX_MAX) + return false; + + return true; +} + +/* + * Go through all unassigned events and find the next one to schedule. + * Take events with the least weight first. Return true on success. + */ +static bool perf_sched_next_event(struct perf_sched *sched) +{ + struct event_constraint *c; + + if (!sched->state.unassigned || !--sched->state.unassigned) + return false; + + do { + /* next event */ + sched->state.event++; + if (sched->state.event >= sched->max_events) { + /* next weight */ + sched->state.event = 0; + sched->state.weight++; + if (sched->state.weight > sched->max_weight) + return false; + } + c = sched->constraints[sched->state.event]; + } while (c->weight != sched->state.weight); + + sched->state.counter = 0; /* start with first counter */ + + return true; +} + +/* + * Assign a counter for each event. + */ +static int perf_assign_events(struct event_constraint **constraints, int n, + int wmin, int wmax, int *assign) +{ + struct perf_sched sched; + + perf_sched_init(&sched, constraints, n, wmin, wmax); + + do { + if (!perf_sched_find_counter(&sched)) + break; /* failed */ + if (assign) + assign[sched.state.event] = sched.state.counter; + } while (perf_sched_next_event(&sched)); + + return sched.state.unassigned; +} + int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) { struct event_constraint *c, *constraints[X86_PMC_IDX_MAX]; unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - int i, j, w, wmax, num = 0; + int i, wmin, wmax, num = 0; struct hw_perf_event *hwc; bitmap_zero(used_mask, X86_PMC_IDX_MAX); - for (i = 0; i < n; i++) { + for (i = 0, wmin = X86_PMC_IDX_MAX, wmax = 0; i < n; i++) { c = x86_pmu.get_event_constraints(cpuc, cpuc->event_list[i]); constraints[i] = c; + wmin = min(wmin, c->weight); + wmax = max(wmax, c->weight); } /* @@ -521,59 +648,11 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (assign) assign[i] = hwc->idx; } - if (i == n) - goto done; - /* - * begin slow path - */ + /* slow path */ + if (i != n) + num = perf_assign_events(constraints, n, wmin, wmax, assign); - bitmap_zero(used_mask, X86_PMC_IDX_MAX); - - /* - * weight = number of possible counters - * - * 1 = most constrained, only works on one counter - * wmax = least constrained, works on any counter - * - * assign events to counters starting with most - * constrained events. 
- */ - wmax = x86_pmu.num_counters; - - /* - * when fixed event counters are present, - * wmax is incremented by 1 to account - * for one more choice - */ - if (x86_pmu.num_counters_fixed) - wmax++; - - for (w = 1, num = n; num && w <= wmax; w++) { - /* for each event */ - for (i = 0; num && i < n; i++) { - c = constraints[i]; - hwc = &cpuc->event_list[i]->hw; - - if (c->weight != w) - continue; - - for_each_set_bit(j, c->idxmsk, X86_PMC_IDX_MAX) { - if (!test_bit(j, used_mask)) - break; - } - - if (j == X86_PMC_IDX_MAX) - break; - - __set_bit(j, used_mask); - - if (assign) - assign[i] = j; - num--; - } - } -done: /* * scheduling failed or is just a simulation, * free resources if necessary -- cgit v1.1 From bc1738f6ee83015f090867813dcca4d690e7917c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 18 Nov 2011 12:35:22 +0100 Subject: perf, x86: Fix event scheduler for constraints with overlapping counters The current x86 event scheduler fails to resolve scheduling problems of certain combinations of events and constraints. This happens if the counter mask of such an event is not a subset of any other counter mask of a constraint with an equal or higher weight, e.g. constraints of the AMD family 15h pmu: counter mask weight amd_f15_PMC30 0x09 2 <--- overlapping counters amd_f15_PMC20 0x07 3 amd_f15_PMC53 0x38 3 The scheduler then fails to find an existing solution. Here is an example: event code counter failure possible solution 0x02E PMC[3,0] 0 3 0x043 PMC[2:0] 1 0 0x045 PMC[2:0] 2 1 0x046 PMC[2:0] FAIL 2 The event scheduler may not select the correct counter in the first cycle because it needs to know which subsequent events will be scheduled. It may fail to schedule the events then. To solve this, we now save the scheduler state of events with overlapping counter constraints. If we fail to schedule the events, we roll back to those states and try to use another free counter. Constraints with overlapping counters are marked with a newly introduced overlap flag. We set the overlap flag for such constraints to give the scheduler a hint which events to select for counter rescheduling. The EVENT_CONSTRAINT_OVERLAP() macro can be used for this. Care must be taken as the rescheduling algorithm is O(n!) which will increase scheduling cycles for an over-committed system dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros and their counter masks must be kept at a minimum. Thus, the current stack is limited to 2 states to limit the number of loops the algorithm takes in the worst case. On systems with no overlapping-counter constraints, this implementation does not increase the loop count compared to the previous algorithm. V2: * Renamed redo -> overlap. * Reimplementation using perf scheduling helper functions. V3: * Added WARN_ON_ONCE() if out of save states. * Changed function interface of perf_sched_restore_state() to use bool as return value.
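The failing example above can be reproduced with a few lines of stand-alone C. This is only an illustration of why rolling back counter choices helps; it is not the kernel algorithm, which bounds the search with the overlap hint and a two-entry state stack instead of doing a full backtracking walk:

    #include <stdio.h>

    /* Counter masks from the example: one PMC[3,0] event and three PMC[2:0] events */
    static const unsigned int masks[] = { 0x09, 0x07, 0x07, 0x07 };
    #define NEVENTS   4
    #define NCOUNTERS 4

    /* Greedy first fit: always take the lowest free counter the mask allows */
    static int greedy(int assign[NEVENTS])
    {
    	unsigned int used = 0;
    	int i, c;

    	for (i = 0; i < NEVENTS; i++) {
    		for (c = 0; c < NCOUNTERS; c++)
    			if ((masks[i] >> c & 1) && !(used >> c & 1))
    				break;
    		if (c == NCOUNTERS)
    			return -1;		/* no counter left: scheduling fails */
    		used |= 1u << c;
    		assign[i] = c;
    	}
    	return 0;
    }

    /* Roll back earlier choices and retry with the next allowed counter */
    static int backtrack(int i, unsigned int used, int assign[NEVENTS])
    {
    	int c;

    	if (i == NEVENTS)
    		return 0;
    	for (c = 0; c < NCOUNTERS; c++) {
    		if ((masks[i] >> c & 1) && !(used >> c & 1)) {
    			assign[i] = c;
    			if (!backtrack(i + 1, used | 1u << c, assign))
    				return 0;
    		}
    	}
    	return -1;
    }

    int main(void)
    {
    	int a[NEVENTS], i;

    	printf("greedy:   %s\n", greedy(a) ? "FAIL" : "ok");
    	if (!backtrack(0, 0, a)) {
    		printf("rollback: ok ->");
    		for (i = 0; i < NEVENTS; i++)
    			printf(" event%d:PMC%d", i, a[i]);
    		printf("\n");
    	}
    	return 0;
    }

Greedy assignment places the 0x09 event on PMC0 and runs out of counters for the last 0x07 event; with rollback the 0x09 event moves to PMC3 and all four events fit, matching the "possible solution" column above.
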
Signed-off-by: Robert Richter Signed-off-by: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/r/1321616122-1533-3-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 45 ++++++++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/perf_event.h | 30 ++++++++++++++++++++++-- arch/x86/kernel/cpu/perf_event_amd.c | 2 +- 3 files changed, 72 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 5a469d3..fa6fdec 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -499,11 +499,16 @@ struct sched_state { unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; +/* Total max is X86_PMC_IDX_MAX, but we are O(n!) limited */ +#define SCHED_STATES_MAX 2 + struct perf_sched { int max_weight; int max_events; struct event_constraint **constraints; struct sched_state state; + int saved_states; + struct sched_state saved[SCHED_STATES_MAX]; }; /* @@ -529,11 +534,34 @@ static void perf_sched_init(struct perf_sched *sched, struct event_constraint ** sched->state.unassigned = num; } +static void perf_sched_save_state(struct perf_sched *sched) +{ + if (WARN_ON_ONCE(sched->saved_states >= SCHED_STATES_MAX)) + return; + + sched->saved[sched->saved_states] = sched->state; + sched->saved_states++; +} + +static bool perf_sched_restore_state(struct perf_sched *sched) +{ + if (!sched->saved_states) + return false; + + sched->saved_states--; + sched->state = sched->saved[sched->saved_states]; + + /* continue with next counter: */ + clear_bit(sched->state.counter++, sched->state.used); + + return true; +} + /* * Select a counter for the current event to schedule. Return true on * success. */ -static bool perf_sched_find_counter(struct perf_sched *sched) +static bool __perf_sched_find_counter(struct perf_sched *sched) { struct event_constraint *c; int idx; @@ -557,6 +585,19 @@ static bool perf_sched_find_counter(struct perf_sched *sched) if (idx >= X86_PMC_IDX_MAX) return false; + if (c->overlap) + perf_sched_save_state(sched); + + return true; +} + +static bool perf_sched_find_counter(struct perf_sched *sched) +{ + while (!__perf_sched_find_counter(sched)) { + if (!perf_sched_restore_state(sched)) + return false; + } + return true; } @@ -1250,7 +1291,7 @@ static int __init init_hw_perf_events(void) unconstrained = (struct event_constraint) __EVENT_CONSTRAINT(0, (1ULL << x86_pmu.num_counters) - 1, - 0, x86_pmu.num_counters); + 0, x86_pmu.num_counters, 0); if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index b9698d4..51a985c 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -45,6 +45,7 @@ struct event_constraint { u64 code; u64 cmask; int weight; + int overlap; }; struct amd_nb { @@ -151,15 +152,40 @@ struct cpu_hw_events { void *kfree_on_online; }; -#define __EVENT_CONSTRAINT(c, n, m, w) {\ +#define __EVENT_CONSTRAINT(c, n, m, w, o) {\ { .idxmsk64 = (n) }, \ .code = (c), \ .cmask = (m), \ .weight = (w), \ + .overlap = (o), \ } #define EVENT_CONSTRAINT(c, n, m) \ - __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n)) + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0) + +/* + * The overlap flag marks event constraints with overlapping counter + * masks. 
This is the case if the counter mask of such an event is not + * a subset of any other counter mask of a constraint with an equal or + * higher weight, e.g.: + * + * c_overlaps = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); + * c_another1 = EVENT_CONSTRAINT(0, 0x07, 0); + * c_another2 = EVENT_CONSTRAINT(0, 0x38, 0); + * + * The event scheduler may not select the correct counter in the first + * cycle because it needs to know which subsequent events will be + * scheduled. It may fail to schedule the events then. So we set the + * overlap flag for such constraints to give the scheduler a hint which + * events to select for counter rescheduling. + * + * Care must be taken as the rescheduling algorithm is O(n!) which + * will increase scheduling cycles for an over-commited system + * dramatically. The number of such EVENT_CONSTRAINT_OVERLAP() macros + * and its counter masks must be kept at a minimum. + */ +#define EVENT_CONSTRAINT_OVERLAP(c, n, m) \ + __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 1) /* * Constraint on the Event code. diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index aeefd45..0397b23 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -492,7 +492,7 @@ static __initconst const struct x86_pmu amd_pmu = { static struct event_constraint amd_f15_PMC0 = EVENT_CONSTRAINT(0, 0x01, 0); static struct event_constraint amd_f15_PMC20 = EVENT_CONSTRAINT(0, 0x07, 0); static struct event_constraint amd_f15_PMC3 = EVENT_CONSTRAINT(0, 0x08, 0); -static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT(0, 0x09, 0); +static struct event_constraint amd_f15_PMC30 = EVENT_CONSTRAINT_OVERLAP(0, 0x09, 0); static struct event_constraint amd_f15_PMC50 = EVENT_CONSTRAINT(0, 0x3F, 0); static struct event_constraint amd_f15_PMC53 = EVENT_CONSTRAINT(0, 0x38, 0); -- cgit v1.1 From 4defea8559bc0f97a899d94c8d19d3b8bb802bc4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 10 Nov 2011 15:15:42 +0100 Subject: perf, x86: Prefer fixed-purpose counters when scheduling This avoids a scheduling failure for cases like: cycles, cycles, instructions, instructions (on Core2) Which would end up being programmed like: PMC0, PMC1, FP-instructions, fail Because all events will have the same weight. 
Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-8tnwb92asqj7xajqqoty4gel@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index fa6fdec..66f8ba9 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -574,16 +574,25 @@ static bool __perf_sched_find_counter(struct perf_sched *sched) c = sched->constraints[sched->state.event]; + /* Prefer fixed purpose counters */ + if (x86_pmu.num_counters_fixed) { + idx = X86_PMC_IDX_FIXED; + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + if (!__test_and_set_bit(idx, sched->state.used)) + goto done; + } + } /* Grab the first unused counter starting with idx */ idx = sched->state.counter; - for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit_cont(idx, c->idxmsk, X86_PMC_IDX_FIXED) { if (!__test_and_set_bit(idx, sched->state.used)) - break; + goto done; } - sched->state.counter = idx; - if (idx >= X86_PMC_IDX_MAX) - return false; + return false; + +done: + sched->state.counter = idx; if (c->overlap) perf_sched_save_state(sched); -- cgit v1.1 From 1cf8343f55525c09c88da0a494a96e1b034f84e2 Mon Sep 17 00:00:00 2001 From: Seiichi Ikarashi Date: Tue, 6 Dec 2011 17:58:14 +0900 Subject: x86: Fix rflags in FAKE_STACK_FRAME The x86_64 kernel pushes the fake kernel stack in arch/x86/kernel/entry_64.S:FAKE_STACK_FRAME, and rflags register in it does not conform to the specification. Although Intel's manual[1] says bit 1 of it shall be set to 1, this bit is cleared to 0 on pushing the fake stack. [1] Intel(R) 64 and IA-32 Architectures Software Developer's Manual Vol.1 3-21 Figure 3-8. EFLAGS Register If it is not on purpose, it is better to be fixed, because it can lead some tools misunderstanding the stack frame. For example, "crash" utility[2] actually detects it and warns you like below: RIP: ffffffff8005dfa2 RSP: ffff8104ce0c7f58 RFLAGS: 00000200 [...] bt: WARNING: possibly bogus exception frame Signed-off-by: Seiichi Ikarashi Tested-by: Masayoshi MIZUMA Cc: Jan Beulich Cc: Frederic Weisbecker Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 2 +- arch/x86/kernel/process.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index cfad7fce..a20e1cb 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -221,7 +221,7 @@ ENDPROC(native_usergs_sysret64) /*CFI_REL_OFFSET ss,0*/ pushq_cfi %rax /* rsp */ CFI_REL_OFFSET rsp,0 - pushq_cfi $X86_EFLAGS_IF /* eflags - interrupts on */ + pushq_cfi $(X86_EFLAGS_IF|X86_EFLAGS_BIT1) /* eflags - interrupts on */ /*CFI_REL_OFFSET rflags,0*/ pushq_cfi $__KERNEL_CS /* cs */ /*CFI_REL_OFFSET cs,0*/ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ee5d4fb..15763af 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -293,7 +293,7 @@ int kernel_thread(int (*fn)(void *), void *arg, unsigned long flags) regs.orig_ax = -1; regs.ip = (unsigned long) kernel_thread_helper; regs.cs = __KERNEL_CS | get_kernel_rpl(); - regs.flags = X86_EFLAGS_IF | 0x2; + regs.flags = X86_EFLAGS_IF | X86_EFLAGS_BIT1; /* Ok, create the new process.. 
*/ return do_fork(flags | CLONE_VM | CLONE_UNTRACED, 0, &regs, 0, NULL, NULL); -- cgit v1.1 From 9cdbe1cbac4ec318037297175587a0080acc9d11 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Dec 2011 17:27:29 +0100 Subject: jump_label, x86: Fix section mismatch WARNING: arch/x86/kernel/built-in.o(.text+0x4c71): Section mismatch in reference from the function arch_jump_label_transform_static() to the function .init.text:text_poke_early() The function arch_jump_label_transform_static() references the function __init text_poke_early(). This is often because arch_jump_label_transform_static lacks a __init annotation or the annotation of text_poke_early is wrong. Signed-off-by: Peter Zijlstra Cc: Jason Baron Link: http://lkml.kernel.org/n/tip-9lefe89mrvurrwpqw5h8xm8z@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/jump_label.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/jump_label.c b/arch/x86/kernel/jump_label.c index ea9d5f2f..2889b3d 100644 --- a/arch/x86/kernel/jump_label.c +++ b/arch/x86/kernel/jump_label.c @@ -50,7 +50,7 @@ void arch_jump_label_transform(struct jump_entry *entry, put_online_cpus(); } -void arch_jump_label_transform_static(struct jump_entry *entry, +__init_or_module void arch_jump_label_transform_static(struct jump_entry *entry, enum jump_label_type type) { __jump_label_transform(entry, type, text_poke_early); -- cgit v1.1 From ffb871bc9156ee2e5cf442f61250c5bd6aad17e3 Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 10 Nov 2011 14:57:26 +0200 Subject: x86, perf: Disable non available architectural events Intel CPUs report non-available architectural events in cpuid leaf 0AH.EBX. Use it to disable events that are not available according to CPU.
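The availability bits come from CPUID leaf 0xA: a set bit in EBX means the corresponding architectural event is not guaranteed by the CPU. A rough open-coded check (illustrative only; the patch below goes through the cpuid10_ebx union instead of raw masks) would be:

    	unsigned int eax, ebx, ecx, edx;

    	cpuid(0x0000000a, &eax, &ebx, &ecx, &edx);

    	/* CPUID.0AH:EBX bit 6 set => branch-mispredicts-retired not available */
    	if (ebx & (1 << 6))
    		pr_info("arch event 'branch misses retired' not available\n");
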
Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1320929850-10480-7-git-send-email-gleb@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.h | 5 +++++ arch/x86/kernel/cpu/perf_event_intel.c | 28 +++++++++++++++++++++++----- 2 files changed, 28 insertions(+), 5 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index 51a985c..f49c5c2 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -285,6 +285,11 @@ struct x86_pmu { int num_counters_fixed; int cntval_bits; u64 cntval_mask; + union { + unsigned long events_maskl; + unsigned long events_mask[BITS_TO_LONGS(ARCH_PERFMON_EVENTS_COUNT)]; + }; + int events_mask_len; int apic; u64 max_period; struct event_constraint * diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8d601b1..201156b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1552,13 +1552,23 @@ static void intel_sandybridge_quirks(void) x86_pmu.pebs_constraints = NULL; } +static const int intel_event_id_to_hw_id[] __initconst = { + PERF_COUNT_HW_CPU_CYCLES, + PERF_COUNT_HW_INSTRUCTIONS, + PERF_COUNT_HW_BUS_CYCLES, + PERF_COUNT_HW_CACHE_REFERENCES, + PERF_COUNT_HW_CACHE_MISSES, + PERF_COUNT_HW_BRANCH_INSTRUCTIONS, + PERF_COUNT_HW_BRANCH_MISSES, +}; + __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; + union cpuid10_ebx ebx; unsigned int unused; - unsigned int ebx; - int version; + int version, bit; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -1574,8 +1584,8 @@ __init int intel_pmu_init(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired hw_event or not. */ - cpuid(10, &eax.full, &ebx, &unused, &edx.full); - if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + cpuid(10, &eax.full, &ebx.full, &unused, &edx.full); + if (eax.split.mask_length < ARCH_PERFMON_EVENTS_COUNT) return -ENODEV; version = eax.split.version_id; @@ -1651,7 +1661,7 @@ __init int intel_pmu_init(void) /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; - if (ebx & 0x40) { + if (ebx.split.no_branch_misses_retired) { /* * Erratum AAJ80 detected, we work it around by using * the BR_MISP_EXEC.ANY event. This will over-count @@ -1659,6 +1669,7 @@ __init int intel_pmu_init(void) * architectural event which is often completely bogus: */ intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; pr_cont("erratum AAJ80 worked around, "); } @@ -1738,5 +1749,12 @@ __init int intel_pmu_init(void) break; } } + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_event_id_to_hw_id)) + intel_perfmon_event_map[intel_event_id_to_hw_id[bit]] = 0; + return 0; } -- cgit v1.1 From c1d6f42f1a42c721513e2f388c208e5348004f64 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 6 Dec 2011 14:07:15 +0100 Subject: perf, x86: Implement arch event mask as quirk Implement the disabling of arch events as a quirk so that we can print a message along with it. This creates some visibility into the problem space and could allow us to work on adding more work-around like the AAJ80 one. 
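The x86_add_quirk() helper added below keeps quirks on a simple linked list that init_hw_perf_events() walks; hooking up one more (purely hypothetical) work-around would look like this:

    static __init void example_model_quirk(void)
    {
    	/* hypothetical quirk, for illustration only */
    	pr_warn("applying example PMU work-around\n");
    }

    	/* ... later, from the model switch in intel_pmu_init(): */
    	x86_add_quirk(example_model_quirk);
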
Requested-by: Ingo Molnar Cc: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/n/tip-wcja2z48wklzu1b0nkz0a5y7@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 5 ++- arch/x86/kernel/cpu/perf_event.h | 16 ++++++- arch/x86/kernel/cpu/perf_event_intel.c | 80 +++++++++++++++++++++------------- 3 files changed, 68 insertions(+), 33 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 66f8ba9..55889e0 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1248,6 +1248,7 @@ static void __init pmu_check_apic(void) static int __init init_hw_perf_events(void) { + struct x86_pmu_quirk *quirk; struct event_constraint *c; int err; @@ -1276,8 +1277,8 @@ static int __init init_hw_perf_events(void) pr_cont("%s PMU driver.\n", x86_pmu.name); - if (x86_pmu.quirks) - x86_pmu.quirks(); + for (quirk = x86_pmu.quirks; quirk; quirk = quirk->next) + quirk->func(); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { WARN(1, KERN_ERR "hw perf events %d > max(%d), clipping!", diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f49c5c2..8944062 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -261,6 +261,11 @@ union perf_capabilities { u64 capabilities; }; +struct x86_pmu_quirk { + struct x86_pmu_quirk *next; + void (*func)(void); +}; + /* * struct x86_pmu - generic x86 pmu */ @@ -299,7 +304,7 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); struct event_constraint *event_constraints; - void (*quirks)(void); + struct x86_pmu_quirk *quirks; int perfctr_second_write; int (*cpu_prepare)(int cpu); @@ -340,6 +345,15 @@ struct x86_pmu { struct perf_guest_switch_msr *(*guest_get_msrs)(int *nr); }; +#define x86_add_quirk(func_) \ +do { \ + static struct x86_pmu_quirk __quirk __initdata = { \ + .func = func_, \ + }; \ + __quirk.next = x86_pmu.quirks; \ + x86_pmu.quirks = &__quirk; \ +} while (0) + #define ERF_NO_HT_SHARING 1 #define ERF_HAS_RSP_1 2 diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 201156b..2c3bf53 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1519,7 +1519,7 @@ static __initconst const struct x86_pmu intel_pmu = { .guest_get_msrs = intel_guest_get_msrs, }; -static void intel_clovertown_quirks(void) +static __init void intel_clovertown_quirk(void) { /* * PEBS is unreliable due to: @@ -1545,30 +1545,61 @@ static void intel_clovertown_quirks(void) x86_pmu.pebs_constraints = NULL; } -static void intel_sandybridge_quirks(void) +static __init void intel_sandybridge_quirk(void) { printk(KERN_WARNING "PEBS disabled due to CPU errata.\n"); x86_pmu.pebs = 0; x86_pmu.pebs_constraints = NULL; } -static const int intel_event_id_to_hw_id[] __initconst = { - PERF_COUNT_HW_CPU_CYCLES, - PERF_COUNT_HW_INSTRUCTIONS, - PERF_COUNT_HW_BUS_CYCLES, - PERF_COUNT_HW_CACHE_REFERENCES, - PERF_COUNT_HW_CACHE_MISSES, - PERF_COUNT_HW_BRANCH_INSTRUCTIONS, - PERF_COUNT_HW_BRANCH_MISSES, +static const struct { int id; char *name; } intel_arch_events_map[] __initconst = { + { PERF_COUNT_HW_CPU_CYCLES, "cpu cycles" }, + { PERF_COUNT_HW_INSTRUCTIONS, "instructions" }, + { PERF_COUNT_HW_BUS_CYCLES, "bus cycles" }, + { PERF_COUNT_HW_CACHE_REFERENCES, "cache references" }, + { PERF_COUNT_HW_CACHE_MISSES, "cache misses" }, + { 
PERF_COUNT_HW_BRANCH_INSTRUCTIONS, "branch instructions" }, + { PERF_COUNT_HW_BRANCH_MISSES, "branch misses" }, }; +static __init void intel_arch_events_quirk(void) +{ + int bit; + + /* disable event that reported as not presend by cpuid */ + for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_arch_events_map)) { + intel_perfmon_event_map[intel_arch_events_map[bit].id] = 0; + printk(KERN_WARNING "CPUID marked event: \'%s\' unavailable\n", + intel_arch_events_map[bit].name); + } +} + +static __init void intel_nehalem_quirk(void) +{ + union cpuid10_ebx ebx; + + ebx.full = x86_pmu.events_maskl; + if (ebx.split.no_branch_misses_retired) { + /* + * Erratum AAJ80 detected, we work it around by using + * the BR_MISP_EXEC.ANY event. This will over-count + * branch-misses, but it's still much better than the + * architectural event which is often completely bogus: + */ + intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; + ebx.split.no_branch_misses_retired = 0; + x86_pmu.events_maskl = ebx.full; + printk(KERN_INFO "CPU erratum AAJ80 worked around\n"); + } +} + __init int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; union cpuid10_ebx ebx; unsigned int unused; - int version, bit; + int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) { switch (boot_cpu_data.x86) { @@ -1599,6 +1630,9 @@ __init int intel_pmu_init(void) x86_pmu.cntval_bits = eax.split.bit_width; x86_pmu.cntval_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.events_maskl = ebx.full; + x86_pmu.events_mask_len = eax.split.mask_length; + /* * Quirk: v2 perfmon does not report fixed-purpose events, so * assume at least 3 events: @@ -1618,6 +1652,8 @@ __init int intel_pmu_init(void) intel_ds_init(); + x86_add_quirk(intel_arch_events_quirk); /* Install first, so it runs last */ + /* * Install the hw-cache-events table: */ @@ -1627,7 +1663,7 @@ __init int intel_pmu_init(void) break; case 15: /* original 65 nm celeron/pentium/core2/xeon, "Merom"/"Conroe" */ - x86_pmu.quirks = intel_clovertown_quirks; + x86_add_quirk(intel_clovertown_quirk); case 22: /* single-core 65 nm celeron/core2solo "Merom-L"/"Conroe-L" */ case 23: /* current 45 nm celeron/core2/xeon "Penryn"/"Wolfdale" */ case 29: /* six-core 45 nm xeon "Dunnington" */ @@ -1661,18 +1697,8 @@ __init int intel_pmu_init(void) /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; - if (ebx.split.no_branch_misses_retired) { - /* - * Erratum AAJ80 detected, we work it around by using - * the BR_MISP_EXEC.ANY event. 
This will over-count - * branch-misses, but it's still much better than the - * architectural event which is often completely bogus: - */ - intel_perfmon_event_map[PERF_COUNT_HW_BRANCH_MISSES] = 0x7f89; - ebx.split.no_branch_misses_retired = 0; + x86_add_quirk(intel_nehalem_quirk); - pr_cont("erratum AAJ80 worked around, "); - } pr_cont("Nehalem events, "); break; @@ -1712,7 +1738,7 @@ __init int intel_pmu_init(void) break; case 42: /* SandyBridge */ - x86_pmu.quirks = intel_sandybridge_quirks; + x86_add_quirk(intel_sandybridge_quirk); case 45: /* SandyBridge, "Romely-EP" */ memcpy(hw_cache_event_ids, snb_hw_cache_event_ids, sizeof(hw_cache_event_ids)); @@ -1749,12 +1775,6 @@ __init int intel_pmu_init(void) break; } } - x86_pmu.events_maskl = ebx.full; - x86_pmu.events_mask_len = eax.split.mask_length; - - /* disable event that reported as not presend by cpuid */ - for_each_set_bit(bit, x86_pmu.events_mask, ARRAY_SIZE(intel_event_id_to_hw_id)) - intel_perfmon_event_map[intel_event_id_to_hw_id[bit]] = 0; return 0; } -- cgit v1.1 From b3d9468a8bd218a695e3a0ff112cd4efd27b670a Mon Sep 17 00:00:00 2001 From: Gleb Natapov Date: Thu, 10 Nov 2011 14:57:27 +0200 Subject: perf, x86: Expose perf capability to other modules KVM needs to know perf capability to decide which PMU it can expose to a guest. Signed-off-by: Gleb Natapov Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1320929850-10480-8-git-send-email-gleb@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 55889e0..930fe48 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1696,3 +1696,15 @@ unsigned long perf_misc_flags(struct pt_regs *regs) return misc; } + +void perf_get_x86_pmu_capability(struct x86_pmu_capability *cap) +{ + cap->version = x86_pmu.version; + cap->num_counters_gp = x86_pmu.num_counters; + cap->num_counters_fixed = x86_pmu.num_counters_fixed; + cap->bit_width_gp = x86_pmu.cntval_bits; + cap->bit_width_fixed = x86_pmu.cntval_bits; + cap->events_mask = (unsigned int)x86_pmu.events_maskl; + cap->events_mask_len = x86_pmu.events_mask_len; +} +EXPORT_SYMBOL_GPL(perf_get_x86_pmu_capability); -- cgit v1.1 From fe091c208a40299fba40e62292a610fb91e44b4e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Dec 2011 10:22:07 -0800 Subject: memblock: Kill memblock_init() memblock_init() initializes arrays for regions and memblock itself; however, all these can be done with struct initializers and memblock_init() can be removed. This patch kills memblock_init() and initializes memblock with struct initializer. The only difference is that the first dummy entries don't have .nid set to MAX_NUMNODES initially. This doesn't cause any behavior difference. Signed-off-by: Tejun Heo Cc: Benjamin Herrenschmidt Cc: Yinghai Lu Cc: Russell King Cc: Michal Simek Cc: Paul Mundt Cc: "David S. Miller" Cc: Guan Xuetao Cc: "H. 
Peter Anvin" --- arch/x86/kernel/head32.c | 2 -- arch/x86/kernel/head64.c | 2 -- 2 files changed, 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/head32.c b/arch/x86/kernel/head32.c index be9282b..51ff186 100644 --- a/arch/x86/kernel/head32.c +++ b/arch/x86/kernel/head32.c @@ -31,8 +31,6 @@ static void __init i386_default_early_setup(void) void __init i386_start_kernel(void) { - memblock_init(); - memblock_reserve(__pa_symbol(&_text), __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index fd25b11..3a3b779 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -98,8 +98,6 @@ void __init x86_64_start_reservations(char *real_mode_data) { copy_bootdata(__va(real_mode_data)); - memblock_init(); - memblock_reserve(__pa_symbol(&_text), __pa_symbol(&__bss_stop) - __pa_symbol(&_text)); -- cgit v1.1 From 1aadc0560f46530f8a0f11055285b876a8a31770 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 8 Dec 2011 10:22:08 -0800 Subject: memblock: s/memblock_analyze()/memblock_allow_resize()/ and update users The only function of memblock_analyze() is now allowing resize of memblock region arrays. Rename it to memblock_allow_resize() and update its users. * The following users remain the same other than renaming. arm/mm/init.c::arm_memblock_init() microblaze/kernel/prom.c::early_init_devtree() powerpc/kernel/prom.c::early_init_devtree() openrisc/kernel/prom.c::early_init_devtree() sh/mm/init.c::paging_init() sparc/mm/init_64.c::paging_init() unicore32/mm/init.c::uc32_memblock_init() * In the following users, analyze was used to update total size which is no longer necessary. powerpc/kernel/machine_kexec.c::reserve_crashkernel() powerpc/kernel/prom.c::early_init_devtree() powerpc/mm/init_32.c::MMU_init() powerpc/mm/tlb_nohash.c::__early_init_mmu() powerpc/platforms/ps3/mm.c::ps3_mm_add_memory() powerpc/platforms/embedded6xx/wii.c::wii_memory_fixups() sh/kernel/machine_kexec.c::reserve_crashkernel() * x86/kernel/e820.c::memblock_x86_fill() was directly setting memblock_can_resize before populating memblock and calling analyze afterwards. Call memblock_allow_resize() before start populating. memblock_can_resize is now static inside memblock.c. Signed-off-by: Tejun Heo Cc: Benjamin Herrenschmidt Cc: Yinghai Lu Cc: Russell King Cc: Michal Simek Cc: Paul Mundt Cc: "David S. Miller" Cc: Guan Xuetao Cc: "H. Peter Anvin" --- arch/x86/kernel/e820.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 056e65d..8071e2f 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1072,7 +1072,7 @@ void __init memblock_x86_fill(void) * We are safe to enable resizing, beause memblock_x86_fill() * is rather later for x86 */ - memblock_can_resize = 1; + memblock_allow_resize(); for (i = 0; i < e820.nr_map; i++) { struct e820entry *ei = &e820.map[i]; @@ -1087,7 +1087,6 @@ void __init memblock_x86_fill(void) memblock_add(ei->addr, ei->size); } - memblock_analyze(); memblock_dump_all(); } -- cgit v1.1 From 2ded6e6a94c98ea453a156748cb7fabaf39a76b9 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Fri, 18 Nov 2011 16:33:06 +0100 Subject: x86, hpet: Immediately disable HPET timer 1 if rtc irq is masked When HPET is operating in RTC mode, the TN_ENABLE bit on timer1 controls whether the HPET or the RTC delivers interrupts to irq8. 
When the system goes into suspend, the RTC driver sends a signal to the HPET driver so that the HPET releases control of irq8, allowing the RTC to wake the system from suspend. The switchover is accomplished by a write to the HPET configuration registers which currently only occurs while servicing the HPET interrupt. On some systems, I have seen the system suspend before an HPET interrupt occurs, preventing the write to the HPET configuration register and leaving the HPET in control of the irq8. As the HPET is not active during suspend, it does not generate a wake signal and RTC alarms do not work. This patch forces the HPET driver to immediately transfer control of the irq8 channel to the RTC instead of waiting until the next interrupt event. Signed-off-by: Mark Langsdorf Link: http://lkml.kernel.org/r/20111118153306.GB16319@alberich.amd.com Tested-by: Andreas Herrmann Signed-off-by: Andreas Herrmann Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org --- arch/x86/kernel/hpet.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b946a9e..1bb0bf4 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -1049,6 +1049,14 @@ int hpet_rtc_timer_init(void) } EXPORT_SYMBOL_GPL(hpet_rtc_timer_init); +static void hpet_disable_rtc_channel(void) +{ + unsigned long cfg; + cfg = hpet_readl(HPET_T1_CFG); + cfg &= ~HPET_TN_ENABLE; + hpet_writel(cfg, HPET_T1_CFG); +} + /* * The functions below are called from rtc driver. * Return 0 if HPET is not being used. @@ -1060,6 +1068,9 @@ int hpet_mask_rtc_irq_bit(unsigned long bit_mask) return 0; hpet_rtc_flags &= ~bit_mask; + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); + return 1; } EXPORT_SYMBOL_GPL(hpet_mask_rtc_irq_bit); @@ -1125,15 +1136,11 @@ EXPORT_SYMBOL_GPL(hpet_rtc_dropped_irq); static void hpet_rtc_timer_reinit(void) { - unsigned int cfg, delta; + unsigned int delta; int lost_ints = -1; - if (unlikely(!hpet_rtc_flags)) { - cfg = hpet_readl(HPET_T1_CFG); - cfg &= ~HPET_TN_ENABLE; - hpet_writel(cfg, HPET_T1_CFG); - return; - } + if (unlikely(!hpet_rtc_flags)) + hpet_disable_rtc_channel(); if (!(hpet_rtc_flags & RTC_PIE) || hpet_pie_limit) delta = hpet_default_delta; -- cgit v1.1 From d059f24a9680805bd73fc5675504d465bfe28b72 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Tue, 29 Nov 2011 20:14:43 +0100 Subject: x86, CPU: Drop superfluous get_cpu_cap() prototype The get_cpu_cap() external function prototype was declared twice so lose one of them. Clean up the header guard while at it. 
Signed-off-by: Borislav Petkov Link: http://lkml.kernel.org/r/1322594083-14507-1-git-send-email-bp@amd64.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/cpu.h | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/cpu.h b/arch/x86/kernel/cpu/cpu.h index 1b22dcc..8bacc78 100644 --- a/arch/x86/kernel/cpu/cpu.h +++ b/arch/x86/kernel/cpu/cpu.h @@ -1,5 +1,4 @@ #ifndef ARCH_X86_CPU_H - #define ARCH_X86_CPU_H struct cpu_model_info { @@ -35,6 +34,4 @@ extern const struct cpu_dev *const __x86_cpu_dev_start[], extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); -extern void get_cpu_cap(struct cpuinfo_x86 *c); - -#endif +#endif /* ARCH_X86_CPU_H */ -- cgit v1.1 From e8c7106280a305e1ff2a3a8a4dfce141469fb039 Mon Sep 17 00:00:00 2001 From: Matt Fleming Date: Fri, 18 Nov 2011 13:09:11 +0000 Subject: x86, efi: Calling __pa() with an ioremap()ed address is invalid If we encounter an efi_memory_desc_t without EFI_MEMORY_WB set in ->attribute we currently call set_memory_uc(), which in turn calls __pa() on a potentially ioremap'd address. On CONFIG_X86_32 this is invalid, resulting in the following oops on some machines: BUG: unable to handle kernel paging request at f7f22280 IP: [] reserve_ram_pages_type+0x89/0x210 [...] Call Trace: [] ? page_is_ram+0x1a/0x40 [] reserve_memtype+0xdf/0x2f0 [] set_memory_uc+0x49/0xa0 [] efi_enter_virtual_mode+0x1c2/0x3aa [] start_kernel+0x291/0x2f2 [] ? loglevel+0x1b/0x1b [] i386_start_kernel+0xbf/0xc8 A better approach to this problem is to map the memory region with the correct attributes from the start, instead of modifying it after the fact. The uncached case can be handled by ioremap_nocache() and the cached by ioremap_cache(). Despite first impressions, it's not possible to use ioremap_cache() to map all cached memory regions on CONFIG_X86_64 because EFI_RUNTIME_SERVICES_DATA regions really don't like being mapped into the vmalloc space, as detailed in the following bug report, https://bugzilla.redhat.com/show_bug.cgi?id=748516 Therefore, we need to ensure that any EFI_RUNTIME_SERVICES_DATA regions are covered by the direct kernel mapping table on CONFIG_X86_64. To accomplish this we now map E820_RESERVED_EFI regions via the direct kernel mapping with the initial call to init_memory_mapping() in setup_arch(), whereas previously these regions wouldn't be mapped if they were after the last E820_RAM region until efi_ioremap() was called. Doing it this way allows us to delete efi_ioremap() completely. Signed-off-by: Matt Fleming Cc: H. 
Peter Anvin Cc: Matthew Garrett Cc: Zhang Rui Cc: Huang Ying Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1321621751-3650-1-git-send-email-matt@console-pimps.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 3 ++- arch/x86/kernel/setup.c | 21 ++++++++++++++++++++- 2 files changed, 22 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 303a0e4..65ffd11 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -135,6 +135,7 @@ static void __init e820_print_type(u32 type) printk(KERN_CONT "(usable)"); break; case E820_RESERVED: + case E820_RESERVED_EFI: printk(KERN_CONT "(reserved)"); break; case E820_ACPI: @@ -783,7 +784,7 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) /* * Find the highest page frame number we have available */ -static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { int i; unsigned long last_pfn = 0; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cf0ef98..9a9e40f 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -691,6 +691,8 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { + unsigned long end_pfn; + #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -932,7 +934,24 @@ void __init setup_arch(char **cmdline_p) init_gbpages(); /* max_pfn_mapped is updated here */ - max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn<>PAGE_SHIFT, E820_RESERVED_EFI); + if (efi_end > max_low_pfn) + end_pfn = efi_end; + } +#endif + + max_low_pfn_mapped = init_memory_mapping(0, end_pfn << PAGE_SHIFT); max_pfn_mapped = max_low_pfn_mapped; #ifdef CONFIG_X86_64 -- cgit v1.1 From 280f06774afedf849f0b34248ed6aff57d0f6908 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:06 +0200 Subject: nohz: Separate out irq exit and idle loop dyntick logic The tick_nohz_stop_sched_tick() function, which tries to delay the next timer tick as long as possible, can be called from two places: - From the idle loop to start the dytick idle mode - From interrupt exit if we have interrupted the dyntick idle mode, so that we reprogram the next tick event in case the irq changed some internal state that requires this action. There are only few minor differences between both that are handled by that function, driven by the ts->inidle cpu variable and the inidle parameter. The whole guarantees that we only update the dyntick mode on irq exit if we actually interrupted the dyntick idle mode, and that we enter in RCU extended quiescent state from idle loop entry only. Split this function into: - tick_nohz_idle_enter(), which sets ts->inidle to 1, enters dynticks idle mode unconditionally if it can, and enters into RCU extended quiescent state. - tick_nohz_irq_exit() which only updates the dynticks idle mode when ts->inidle is set (ie: if tick_nohz_idle_enter() has been called). To maintain symmetry, tick_nohz_restart_sched_tick() has been renamed into tick_nohz_idle_exit(). This simplifies the code and micro-optimize the irq exit path (no need for local_irq_save there). This also prepares for the split between dynticks and rcu extended quiescent state logics. We'll need this split to further fix illegal uses of RCU in extended quiescent states in the idle loop. 
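For reference, the arch-side shape that this split produces is condensed below from the x86 conversion in the hunks that follow; it is only a sketch (offline handling, tracing and cpuidle details are omitted), not the full idle loop:

    /* Condensed idle loop after the split: enter dyntick-idle mode once per
     * idle period, let the arch sleep routine run, restart the tick on exit. */
    while (1) {
            tick_nohz_idle_enter();         /* was tick_nohz_stop_sched_tick(1) */
            while (!need_resched()) {
                    local_irq_disable();
                    pm_idle();              /* arch sleep, returns with irqs enabled */
            }
            tick_nohz_idle_exit();          /* was tick_nohz_restart_sched_tick() */
            preempt_enable_no_resched();
            schedule();
            preempt_disable();
    }

The irq-exit side, tick_nohz_irq_exit(), only re-evaluates the next tick event when this idle sequence was actually interrupted, which is the asymmetry the description above is about.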
Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 795b79f..6d9d4d5 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +116,7 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 3bd7e6e..b069e9d 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_stop_sched_tick(1); + tick_nohz_idle_enter(); while (!need_resched()) { rmb(); @@ -149,7 +149,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_restart_sched_tick(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); -- cgit v1.1 From 2bbb6817c0ac1b5f2a68d720f364f98eeb1ac4fd Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Oct 2011 16:01:00 +0200 Subject: nohz: Allow rcu extended quiescent state handling seperately from tick stop It is assumed that rcu won't be used once we switch to tickless mode and until we restart the tick. However this is not always true, as in x86-64 where we dereference the idle notifiers after the tick is stopped. To prepare for fixing this, add two new APIs: tick_nohz_idle_enter_norcu() and tick_nohz_idle_exit_norcu(). If no use of RCU is made in the idle loop between tick_nohz_enter_idle() and tick_nohz_exit_idle() calls, the arch must instead call the new *_norcu() version such that the arch doesn't need to call rcu_idle_enter() and rcu_idle_exit(). Otherwise the arch must call tick_nohz_enter_idle() and tick_nohz_exit_idle() and also call explicitly: - rcu_idle_enter() after its last use of RCU before the CPU is put to sleep. - rcu_idle_exit() before the first use of RCU after the CPU is woken up. Signed-off-by: Frederic Weisbecker Cc: Mike Frysinger Cc: Guan Xuetao Cc: David Miller Cc: Chris Metcalf Cc: Hans-Christian Egtvedt Cc: Ralf Baechle Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Russell King Cc: Paul Mackerras Cc: Heiko Carstens Cc: Paul Mundt Signed-off-by: Paul E. 
McKenney --- arch/x86/kernel/process_32.c | 4 ++-- arch/x86/kernel/process_64.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 6d9d4d5..f94da39 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +116,7 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b069e9d..18e8cf3 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter(); + tick_nohz_idle_enter_norcu(); while (!need_resched()) { rmb(); @@ -149,7 +149,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_idle_exit(); + tick_nohz_idle_exit_norcu(); preempt_enable_no_resched(); schedule(); preempt_disable(); -- cgit v1.1 From e37e112de3ac64032df45c2db0dbe1e8f1af86b4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:08 +0200 Subject: x86: Enter rcu extended qs after idle notifier call The idle notifier, called by enter_idle(), enters into rcu read side critical section but at that time we already switched into the RCU-idle window (rcu_idle_enter() has been called). And it's illegal to use rcu_read_lock() in that state. This results in rcu reporting its bad mood: [ 1.275635] WARNING: at include/linux/rcupdate.h:194 __atomic_notifier_call_chain+0xd2/0x110() [ 1.275635] Hardware name: AMD690VM-FMH [ 1.275635] Modules linked in: [ 1.275635] Pid: 0, comm: swapper Not tainted 3.0.0-rc6+ #252 [ 1.275635] Call Trace: [ 1.275635] [] warn_slowpath_common+0x7a/0xb0 [ 1.275635] [] warn_slowpath_null+0x15/0x20 [ 1.275635] [] __atomic_notifier_call_chain+0xd2/0x110 [ 1.275635] [] atomic_notifier_call_chain+0x11/0x20 [ 1.275635] [] enter_idle+0x20/0x30 [ 1.275635] [] cpu_idle+0xa5/0x110 [ 1.275635] [] rest_init+0xe5/0x140 [ 1.275635] [] ? rest_init+0x48/0x140 [ 1.275635] [] start_kernel+0x3d1/0x3dc [ 1.275635] [] x86_64_start_reservations+0x131/0x135 [ 1.275635] [] x86_64_start_kernel+0xed/0xf4 [ 1.275635] ---[ end trace a22d306b065d4a66 ]--- Fix this by entering rcu extended quiescent state later, just before the CPU goes to sleep. Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Signed-off-by: Paul E. 
McKenney Reviewed-by: Josh Triplett --- arch/x86/kernel/process_64.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 18e8cf3..64e926c 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -122,7 +122,7 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); while (!need_resched()) { rmb(); @@ -139,8 +139,14 @@ void cpu_idle(void) enter_idle(); /* Don't trace irqs off for idle */ stop_critical_timings(); + + /* enter_idle() needs rcu for notifiers */ + rcu_idle_enter(); + if (cpuidle_idle_call()) pm_idle(); + + rcu_idle_exit(); start_critical_timings(); /* In many cases the interrupt that ended idle @@ -149,7 +155,7 @@ void cpu_idle(void) __exit_idle(); } - tick_nohz_idle_exit_norcu(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); -- cgit v1.1 From 98ad1cc14a5c4fd658f9d72c6ba5c86dfd3ce0d5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Oct 2011 18:22:09 +0200 Subject: x86: Call idle notifier after irq_enter() Interrupts notify the idle exit state before calling irq_enter(). But the notifier code calls rcu_read_lock() and this is not allowed while rcu is in an extended quiescent state. We need to wait for irq_enter() -> rcu_idle_exit() to be called before doing so otherwise this results in a grumpy RCU: [ 0.099991] WARNING: at include/linux/rcupdate.h:194 __atomic_notifier_call_chain+0xd2/0x110() [ 0.099991] Hardware name: AMD690VM-FMH [ 0.099991] Modules linked in: [ 0.099991] Pid: 0, comm: swapper Not tainted 3.0.0-rc6+ #255 [ 0.099991] Call Trace: [ 0.099991] [] warn_slowpath_common+0x7a/0xb0 [ 0.099991] [] warn_slowpath_null+0x15/0x20 [ 0.099991] [] __atomic_notifier_call_chain+0xd2/0x110 [ 0.099991] [] atomic_notifier_call_chain+0x11/0x20 [ 0.099991] [] exit_idle+0x43/0x50 [ 0.099991] [] smp_apic_timer_interrupt+0x39/0xa0 [ 0.099991] [] apic_timer_interrupt+0x13/0x20 [ 0.099991] [] ? default_idle+0xa7/0x350 [ 0.099991] [] ? default_idle+0xa5/0x350 [ 0.099991] [] amd_e400_idle+0x8b/0x110 [ 0.099991] [] ? rcu_enter_nohz+0x8f/0x160 [ 0.099991] [] cpu_idle+0xb0/0x110 [ 0.099991] [] rest_init+0xe5/0x140 [ 0.099991] [] ? rest_init+0x48/0x140 [ 0.099991] [] start_kernel+0x3d1/0x3dc [ 0.099991] [] x86_64_start_reservations+0x131/0x135 [ 0.099991] [] x86_64_start_kernel+0xed/0xf4 Signed-off-by: Frederic Weisbecker Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: H. Peter Anvin Cc: Andy Henroid Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett --- arch/x86/kernel/apic/apic.c | 6 +++--- arch/x86/kernel/apic/io_apic.c | 2 +- arch/x86/kernel/cpu/mcheck/therm_throt.c | 2 +- arch/x86/kernel/cpu/mcheck/threshold.c | 2 +- arch/x86/kernel/irq.c | 6 +++--- 5 files changed, 9 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f98d84c..2cd2d93 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -876,8 +876,8 @@ void __irq_entry smp_apic_timer_interrupt(struct pt_regs *regs) * Besides, if we don't timer interrupts ignore the global * interrupt lock, which is the WrongThing (tm) to do. 
*/ - exit_idle(); irq_enter(); + exit_idle(); local_apic_timer_interrupt(); irq_exit(); @@ -1809,8 +1809,8 @@ void smp_spurious_interrupt(struct pt_regs *regs) { u32 v; - exit_idle(); irq_enter(); + exit_idle(); /* * Check if this really is a spurious interrupt and ACK it * if it is a vectored one. Just in case... @@ -1846,8 +1846,8 @@ void smp_error_interrupt(struct pt_regs *regs) "Illegal register address", /* APIC Error Bit 7 */ }; - exit_idle(); irq_enter(); + exit_idle(); /* First tickle the hardware, only then report what went on. -- REW */ v0 = apic_read(APIC_ESR); apic_write(APIC_ESR, 0); diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d939d7..8980555 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2421,8 +2421,8 @@ asmlinkage void smp_irq_move_cleanup_interrupt(void) unsigned vector, me; ack_APIC_irq(); - exit_idle(); irq_enter(); + exit_idle(); me = smp_processor_id(); for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c..ce21561 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -397,8 +397,8 @@ static void (*smp_thermal_vector)(void) = unexpected_thermal_interrupt; asmlinkage void smp_thermal_interrupt(struct pt_regs *regs) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_thermal_count); smp_thermal_vector(); irq_exit(); diff --git a/arch/x86/kernel/cpu/mcheck/threshold.c b/arch/x86/kernel/cpu/mcheck/threshold.c index d746df2..aa578ca 100644 --- a/arch/x86/kernel/cpu/mcheck/threshold.c +++ b/arch/x86/kernel/cpu/mcheck/threshold.c @@ -19,8 +19,8 @@ void (*mce_threshold_vector)(void) = default_threshold_interrupt; asmlinkage void smp_threshold_interrupt(void) { - exit_idle(); irq_enter(); + exit_idle(); inc_irq_stat(irq_threshold_count); mce_threshold_vector(); irq_exit(); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 429e0c9..5d31e5b 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -181,8 +181,8 @@ unsigned int __irq_entry do_IRQ(struct pt_regs *regs) unsigned vector = ~regs->orig_ax; unsigned irq; - exit_idle(); irq_enter(); + exit_idle(); irq = __this_cpu_read(vector_irq[vector]); @@ -209,10 +209,10 @@ void smp_x86_platform_ipi(struct pt_regs *regs) ack_APIC_irq(); - exit_idle(); - irq_enter(); + exit_idle(); + inc_irq_stat(x86_platform_ipis); if (x86_platform_ipi_callback) -- cgit v1.1 From 1268fbc746ea1cd279886a740dcbad4ba5232225 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 17 Nov 2011 18:48:14 +0100 Subject: nohz: Remove tick_nohz_idle_enter_norcu() / tick_nohz_idle_exit_norcu() Those two APIs were provided to optimize the calls of tick_nohz_idle_enter() and rcu_idle_enter() into a single irq disabled section. This way no interrupt happening in-between would needlessly process any RCU job. Now we are talking about an optimization for which benefits have yet to be measured. Let's start simple and completely decouple idle rcu and dyntick idle logics to simplify. Signed-off-by: Frederic Weisbecker Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Reviewed-by: Josh Triplett Signed-off-by: Paul E. 
McKenney --- arch/x86/kernel/process_32.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index f94da39..485204f 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -99,7 +99,8 @@ void cpu_idle(void) /* endless idle loop with no priority at all */ while (1) { - tick_nohz_idle_enter_norcu(); + tick_nohz_idle_enter(); + rcu_idle_enter(); while (!need_resched()) { check_pgt_cache(); @@ -116,7 +117,8 @@ void cpu_idle(void) pm_idle(); start_critical_timings(); } - tick_nohz_idle_exit_norcu(); + rcu_idle_exit(); + tick_nohz_idle_exit(); preempt_enable_no_resched(); schedule(); preempt_disable(); -- cgit v1.1 From e1ad783b12ec8b69da83479c5d21a0d8180bc519 Mon Sep 17 00:00:00 2001 From: Keith Packard Date: Sun, 11 Dec 2011 16:12:42 -0800 Subject: Revert "x86, efi: Calling __pa() with an ioremap()ed address is invalid" This hangs my MacBook Air at boot time; I get no console messages at all. I reverted this on top of -rc5 and my machine boots again. This reverts commit e8c7106280a305e1ff2a3a8a4dfce141469fb039. Signed-off-by: Matt Fleming Signed-off-by: Keith Packard Acked-by: H. Peter Anvin Cc: Matthew Garrett Cc: Zhang Rui Cc: Huang Ying Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/1321621751-3650-1-git-send-email-matt@console Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 3 +-- arch/x86/kernel/setup.c | 21 +-------------------- 2 files changed, 2 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 65ffd11..303a0e4 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -135,7 +135,6 @@ static void __init e820_print_type(u32 type) printk(KERN_CONT "(usable)"); break; case E820_RESERVED: - case E820_RESERVED_EFI: printk(KERN_CONT "(reserved)"); break; case E820_ACPI: @@ -784,7 +783,7 @@ u64 __init early_reserve_e820(u64 startt, u64 sizet, u64 align) /* * Find the highest page frame number we have available */ -unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) +static unsigned long __init e820_end_pfn(unsigned long limit_pfn, unsigned type) { int i; unsigned long last_pfn = 0; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 9a9e40f..cf0ef98 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -691,8 +691,6 @@ early_param("reservelow", parse_reservelow); void __init setup_arch(char **cmdline_p) { - unsigned long end_pfn; - #ifdef CONFIG_X86_32 memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); visws_early_detect(); @@ -934,24 +932,7 @@ void __init setup_arch(char **cmdline_p) init_gbpages(); /* max_pfn_mapped is updated here */ - end_pfn = max_low_pfn; - -#ifdef CONFIG_X86_64 - /* - * There may be regions after the last E820_RAM region that we - * want to include in the kernel direct mapping, such as - * EFI_RUNTIME_SERVICES_DATA. 
- */ - if (efi_enabled) { - unsigned long efi_end; - - efi_end = e820_end_pfn(MAXMEM>>PAGE_SHIFT, E820_RESERVED_EFI); - if (efi_end > max_low_pfn) - end_pfn = efi_end; - } -#endif - - max_low_pfn_mapped = init_memory_mapping(0, end_pfn << PAGE_SHIFT); + max_low_pfn_mapped = init_memory_mapping(0, max_low_pfn< Date: Tue, 13 Dec 2011 11:51:53 +0900 Subject: x86: Add per-cpu stat counter for APIC ICR read tries MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the IPI delivery slow path (NMI delivery) we retry the ICR read to check for delivery completion a limited number of times. [ The reason for the limited retries is that some of the places where it is used (cpu boot, kdump, etc) IPI delivery might not succeed (due to a firmware bug or system crash, for example) and in such a case it is better to give up and resume execution of other code. ] This patch adds a new entry to /proc/interrupts, RTR, which tells user space the number of times we retried the ICR read in the IPI delivery slow path. This should give some insight into how well the APIC message delivery hardware is working - if the counts are way too large then we are hitting a (very-) slow path way too often. Signed-off-by: Fernando Luis Vazquez Cao Cc: Jörn Engel Cc: Suresh Siddha Link: http://lkml.kernel.org/n/tip-vzsp20lo2xdzh5f70g0eis2s@git.kernel.org [ extended the changelog ] Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 6 ++++++ arch/x86/kernel/irq.c | 5 +++++ 2 files changed, 11 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index f98d84c..2942794 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -79,6 +79,11 @@ DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); +/* + * ICR read retry counter + */ +DEFINE_PER_CPU(unsigned, icr_read_retry_count); + #ifdef CONFIG_X86_32 /* @@ -250,6 +255,7 @@ u32 native_safe_apic_wait_icr_idle(void) send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; if (!send_status) break; + percpu_inc(icr_read_retry_count); udelay(100); } while (timeout++ < 1000); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 429e0c9..4bbf162 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -74,6 +74,10 @@ int arch_show_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_irq_work_irqs); seq_printf(p, " IRQ work interrupts\n"); + seq_printf(p, "%*s: ", prec, "RTR"); + for_each_online_cpu(j) + seq_printf(p, "%10u ", per_cpu(icr_read_retry_count, j)); + seq_printf(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { seq_printf(p, "%*s: ", prec, "PLT"); @@ -136,6 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; + sum += per_cpu(icr_read_retry_count, cpu); #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; -- cgit v1.1 From f72c1a576565a4927d650218e183ab5053ab8c3a Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 2 Dec 2011 16:50:04 +0100 Subject: x86, microcode, AMD: Add a vendor-specific exit function This will be used to do cleanup work before the driver exits. 
Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 4 ++++ arch/x86/kernel/microcode_core.c | 5 +++++ 2 files changed, 9 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d494799..e8a68c2 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -353,3 +353,7 @@ struct microcode_ops * __init init_amd_microcode(void) { return µcode_amd_ops; } + +void __exit exit_amd_microcode(void) +{ +} diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 9d46f5e..9302e2d 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -563,6 +563,8 @@ module_init(microcode_init); static void __exit microcode_exit(void) { + struct cpuinfo_x86 *c = &cpu_data(0); + microcode_dev_exit(); unregister_hotcpu_notifier(&mc_cpu_notifier); @@ -580,6 +582,9 @@ static void __exit microcode_exit(void) microcode_ops = NULL; + if (c->x86_vendor == X86_VENDOR_AMD) + exit_amd_microcode(); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } module_exit(microcode_exit); -- cgit v1.1 From 96b0ee4588036b6fa7cf38c17a9e40531241e895 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 2 Dec 2011 17:16:55 +0100 Subject: x86, microcode, AMD: Add a reusable buffer Add a simple 4K page which gets allocated on driver init and freed on driver exit instead of vmalloc'ing small buffers for each ucode patch. Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index e8a68c2..9129c69 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -71,6 +71,9 @@ struct microcode_amd { static struct equiv_cpu_entry *equiv_cpu_table; +/* page-sized ucode patch buffer */ +void *patch; + static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu); @@ -351,9 +354,14 @@ static struct microcode_ops microcode_amd_ops = { struct microcode_ops * __init init_amd_microcode(void) { + patch = (void *)get_zeroed_page(GFP_KERNEL); + if (!patch) + return NULL; + return µcode_amd_ops; } void __exit exit_amd_microcode(void) { + free_page((unsigned long)patch); } -- cgit v1.1 From be62adb492943ce2525ff19401b389a85006ad15 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 2 Dec 2011 18:02:17 +0100 Subject: x86, microcode, AMD: Simplify ucode verification Basically, what we did until now is take out a chunk of the firmware image, vmalloc space for it and inspect it before application. And repeat. This patch changes all that so that we look at each ucode patch from the firmware image, check it for sanity and copy it to local buffer for application only once and if it passes all checks. Thus, vmalloc-ing for each piece is gone, we can do proper size checking only of the patch which is destined for the CPU of the current machine instead of each single patch, which is clearly wrong. Oh yeah, simplify and cleanup the code while at it, along with adding comments as to what actually happens. 
Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 179 +++++++++++++++++++++------------------- 1 file changed, 93 insertions(+), 86 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 9129c69..384990d 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -89,27 +89,76 @@ static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) return 0; } -static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, - int rev) +static unsigned int verify_ucode_size(int cpu, u32 patch_size, + unsigned int size) { - unsigned int current_cpu_id; - u16 equiv_cpu_id = 0; - unsigned int i = 0; + struct cpuinfo_x86 *c = &cpu_data(cpu); + u32 max_size; + +#define F1XH_MPB_MAX_SIZE 2048 +#define F14H_MPB_MAX_SIZE 1824 +#define F15H_MPB_MAX_SIZE 4096 + + switch (c->x86) { + case 0x14: + max_size = F14H_MPB_MAX_SIZE; + break; + case 0x15: + max_size = F15H_MPB_MAX_SIZE; + break; + default: + max_size = F1XH_MPB_MAX_SIZE; + break; + } + + if (patch_size > min_t(u32, size, max_size)) { + pr_err("patch size mismatch\n"); + return 0; + } + + return patch_size; +} + +static u16 find_equiv_id(void) +{ + unsigned int current_cpu_id, i = 0; BUG_ON(equiv_cpu_table == NULL); + current_cpu_id = cpuid_eax(0x00000001); while (equiv_cpu_table[i].installed_cpu != 0) { - if (current_cpu_id == equiv_cpu_table[i].installed_cpu) { - equiv_cpu_id = equiv_cpu_table[i].equiv_cpu; - break; - } + if (current_cpu_id == equiv_cpu_table[i].installed_cpu) + return equiv_cpu_table[i].equiv_cpu; + i++; } + return 0; +} +/* + * we signal a good patch is found by returning its size > 0 + */ +static int get_matching_microcode(int cpu, const u8 *ucode_ptr, + unsigned int leftover_size, int rev, + unsigned int *current_size) +{ + struct microcode_header_amd *mc_hdr; + unsigned int actual_size; + u16 equiv_cpu_id; + + /* size of the current patch we're staring at */ + *current_size = *(u32 *)(ucode_ptr + 4) + SECTION_HDR_SIZE; + + equiv_cpu_id = find_equiv_id(); if (!equiv_cpu_id) return 0; + /* + * let's look at the patch header itself now + */ + mc_hdr = (struct microcode_header_amd *)(ucode_ptr + SECTION_HDR_SIZE); + if (mc_hdr->processor_rev_id != equiv_cpu_id) return 0; @@ -123,7 +172,20 @@ static int get_matching_microcode(int cpu, struct microcode_header_amd *mc_hdr, if (mc_hdr->patch_id <= rev) return 0; - return 1; + /* + * now that the header looks sane, verify its size + */ + actual_size = verify_ucode_size(cpu, *current_size, leftover_size); + if (!actual_size) + return 0; + + /* clear the patch buffer */ + memset(patch, 0, PAGE_SIZE); + + /* all looks ok, get the binary patch */ + get_ucode_data(patch, ucode_ptr + SECTION_HDR_SIZE, actual_size); + + return actual_size; } static int apply_microcode_amd(int cpu) @@ -158,63 +220,6 @@ static int apply_microcode_amd(int cpu) return 0; } -static unsigned int verify_ucode_size(int cpu, const u8 *buf, unsigned int size) -{ - struct cpuinfo_x86 *c = &cpu_data(cpu); - u32 max_size, actual_size; - -#define F1XH_MPB_MAX_SIZE 2048 -#define F14H_MPB_MAX_SIZE 1824 -#define F15H_MPB_MAX_SIZE 4096 - - switch (c->x86) { - case 0x14: - max_size = F14H_MPB_MAX_SIZE; - break; - case 0x15: - max_size = F15H_MPB_MAX_SIZE; - break; - default: - max_size = F1XH_MPB_MAX_SIZE; - break; - } - - actual_size = *(u32 *)(buf + 4); - - if (actual_size + SECTION_HDR_SIZE > size || actual_size > max_size) { - pr_err("section size mismatch\n"); - return 0; - } - - 
return actual_size; -} - -static struct microcode_header_amd * -get_next_ucode(int cpu, const u8 *buf, unsigned int size, unsigned int *mc_size) -{ - struct microcode_header_amd *mc = NULL; - unsigned int actual_size = 0; - - if (*(u32 *)buf != UCODE_UCODE_TYPE) { - pr_err("invalid type field in container file section header\n"); - goto out; - } - - actual_size = verify_ucode_size(cpu, buf, size); - if (!actual_size) - goto out; - - mc = vzalloc(actual_size); - if (!mc) - goto out; - - get_ucode_data(mc, buf + SECTION_HDR_SIZE, actual_size); - *mc_size = actual_size + SECTION_HDR_SIZE; - -out: - return mc; -} - static int install_equiv_cpu_table(const u8 *buf) { unsigned int *ibuf = (unsigned int *)buf; @@ -250,36 +255,38 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; struct microcode_header_amd *mc_hdr = NULL; - unsigned int mc_size, leftover; + unsigned int mc_size, leftover, current_size = 0; int offset; const u8 *ucode_ptr = data; void *new_mc = NULL; unsigned int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_OK; + enum ucode_state state = UCODE_ERROR; offset = install_equiv_cpu_table(ucode_ptr); if (offset < 0) { pr_err("failed to create equivalent cpu table\n"); - return UCODE_ERROR; + goto out; } - ucode_ptr += offset; leftover = size - offset; - while (leftover) { - mc_hdr = get_next_ucode(cpu, ucode_ptr, leftover, &mc_size); - if (!mc_hdr) - break; + if (*(u32 *)ucode_ptr != UCODE_UCODE_TYPE) { + pr_err("invalid type field in container file section header\n"); + goto free_table; + } - if (get_matching_microcode(cpu, mc_hdr, new_rev)) { - vfree(new_mc); + while (leftover) { + mc_size = get_matching_microcode(cpu, ucode_ptr, leftover, + new_rev, ¤t_size); + if (mc_size) { + mc_hdr = patch; + new_mc = patch; new_rev = mc_hdr->patch_id; - new_mc = mc_hdr; - } else - vfree(mc_hdr); - - ucode_ptr += mc_size; - leftover -= mc_size; + leftover -= mc_size; + } else { + ucode_ptr += current_size; + leftover -= current_size; + } } if (!new_mc) { @@ -288,18 +295,19 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) } if (!leftover) { - vfree(uci->mc); uci->mc = new_mc; + state = UCODE_OK; pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", cpu, uci->cpu_sig.rev, new_rev); } else { - vfree(new_mc); + new_mc = NULL; state = UCODE_ERROR; } free_table: free_equiv_cpu_table(); +out: return state; } @@ -340,7 +348,6 @@ static void microcode_fini_cpu_amd(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - vfree(uci->mc); uci->mc = NULL; } -- cgit v1.1 From d733689ad57ec332fb1e392115d83a75f35df1cf Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Wed, 7 Dec 2011 17:26:56 +0100 Subject: x86, microcode, AMD: Exit early on success Once we've found and validated the ucode patch for the current CPU, there's no need to iterate over the remaining patches in the binary image. Exit then and save us a bunch of cycles. 
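Taken together with the two preceding microcode patches, the patch-selection loop in generic_load_microcode() ends up roughly as sketched below (condensed from the hunks in this series; the equivalence-table setup and error paths are left out):

    /* Walk the container section by section: size-check the patch against the
     * per-family limit, match it against the CPU's equivalence ID, and copy a
     * winning patch into the preallocated page buffer. Stop at the first hit. */
    while (leftover) {
            mc_size = get_matching_microcode(cpu, ucode_ptr, leftover,
                                             new_rev, &current_size);
            if (mc_size) {                          /* sane, matching, newer patch */
                    mc_hdr  = patch;                /* page-sized buffer from init */
                    new_mc  = patch;
                    new_rev = mc_hdr->patch_id;
                    goto out_ok;                    /* exit early on success */
            }
            ucode_ptr += current_size;              /* skip this section */
            leftover  -= current_size;
    }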
Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 384990d..d80e943 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -282,11 +282,11 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) mc_hdr = patch; new_mc = patch; new_rev = mc_hdr->patch_id; - leftover -= mc_size; - } else { - ucode_ptr += current_size; - leftover -= current_size; + goto out_ok; } + + ucode_ptr += current_size; + leftover -= current_size; } if (!new_mc) { @@ -294,15 +294,11 @@ generic_load_microcode(int cpu, const u8 *data, size_t size) goto free_table; } - if (!leftover) { - uci->mc = new_mc; - state = UCODE_OK; - pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", - cpu, uci->cpu_sig.rev, new_rev); - } else { - new_mc = NULL; - state = UCODE_ERROR; - } +out_ok: + uci->mc = new_mc; + state = UCODE_OK; + pr_debug("CPU%d update ucode (0x%08x -> 0x%08x)\n", + cpu, uci->cpu_sig.rev, new_rev); free_table: free_equiv_cpu_table(); -- cgit v1.1 From 597e11a367e8fd942b75b8e5117902ebce939474 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Fri, 2 Dec 2011 18:09:23 +0100 Subject: x86, microcode, AMD: Update copyrights Add Andreas and me as current maintainers. Signed-off-by: Borislav Petkov --- arch/x86/kernel/microcode_amd.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index d80e943..fe86493 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -1,14 +1,18 @@ /* * AMD CPU Microcode Update Driver for Linux - * Copyright (C) 2008 Advanced Micro Devices Inc. + * Copyright (C) 2008-2011 Advanced Micro Devices Inc. * * Author: Peter Oruba * * Based on work by: * Tigran Aivazian * - * This driver allows to upgrade microcode on AMD - * family 0x10 and 0x11 processors. + * Maintainers: + * Andreas Herrmann + * Borislav Petkov + * + * This driver allows to upgrade microcode on F10h AMD + * CPUs and later. * * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. -- cgit v1.1 From 3653ada5d3e173489b3a466305687cb5c44b2ab1 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 4 Dec 2011 15:12:09 +0100 Subject: x86, mce: Add wrappers for registering on the decode chain No functionality change, this is done so that in a follow-on patch all queued-up MCEs can be decoded after registering on the chain. Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d..c3c66ac 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -95,13 +95,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -/* - * CPU/chipset specific EDAC code can register a notifier call here to print - * MCE errors in a human-readable form. - */ -ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); -EXPORT_SYMBOL_GPL(x86_mce_decoder_chain); - /* MCA banks polled by the period polling timer for corrected events */ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { [0 ... 
BITS_TO_LONGS(MAX_NR_BANKS)-1] = ~0UL @@ -109,6 +102,12 @@ DEFINE_PER_CPU(mce_banks_t, mce_poll_banks) = { static DEFINE_PER_CPU(struct work_struct, mce_work); +/* + * CPU/chipset specific EDAC code can register a notifier call here to print + * MCE errors in a human-readable form. + */ +ATOMIC_NOTIFIER_HEAD(x86_mce_decoder_chain); + /* Do initial initialization of a struct mce */ void mce_setup(struct mce *m) { @@ -190,6 +189,18 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +void mce_register_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_register_decode_chain); + +void mce_unregister_decode_chain(struct notifier_block *nb) +{ + atomic_notifier_chain_unregister(&x86_mce_decoder_chain, nb); +} +EXPORT_SYMBOL_GPL(mce_unregister_decode_chain); + static void print_mce(struct mce *m) { int ret = 0; -- cgit v1.1 From 0937195715713b37ec843f28d99930dd7b1e8fbe Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Thu, 8 Dec 2011 12:28:33 +0100 Subject: x86, MCE: Drain mcelog buffer Add a function which drains whatever MCEs were logged in already during boot and before the decoder chains were registered. Signed-off-by: Borislav Petkov --- arch/x86/kernel/cpu/mcheck/mce.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index c3c66ac..5be2464 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -189,9 +189,48 @@ void mce_log(struct mce *mce) set_bit(0, &mce_need_notify); } +static void drain_mcelog_buffer(void) +{ + unsigned int next, i, prev = 0; + + next = rcu_dereference_check_mce(mcelog.next); + + do { + struct mce *m; + + /* drain what was logged during boot */ + for (i = prev; i < next; i++) { + unsigned long start = jiffies; + unsigned retries = 1; + + m = &mcelog.entry[i]; + + while (!m->finished) { + if (time_after_eq(jiffies, start + 2*retries)) + retries++; + + cpu_relax(); + + if (!m->finished && retries >= 4) { + pr_err("MCE: skipping error being logged currently!\n"); + break; + } + } + smp_rmb(); + atomic_notifier_call_chain(&x86_mce_decoder_chain, 0, m); + } + + memset(mcelog.entry + prev, 0, (next - prev) * sizeof(*m)); + prev = next; + next = cmpxchg(&mcelog.next, prev, 0); + } while (next != prev); +} + + void mce_register_decode_chain(struct notifier_block *nb) { atomic_notifier_chain_register(&x86_mce_decoder_chain, nb); + drain_mcelog_buffer(); } EXPORT_SYMBOL_GPL(mce_register_decode_chain); -- cgit v1.1 From 29e9bf1841e4f9df13b4992a716fece7087dd237 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 4 Nov 2011 13:31:23 -0700 Subject: x86, mce, therm_throt: Don't report power limit and package level thermal throttle events in mcelog Thermal throttle and power limit events are not defined as MCE errors in x86 architecture and should not generate MCE errors in mcelog. Current kernel generates fake software defined MCE errors for these events. This may confuse users because they may think the machine has real MCE errors while actually only thermal throttle or power limit events happen. To make it worse, buggy firmware on some platforms may falsely generate the events. Therefore, kernel reports MCE errors which users think as real hardware errors. Although the firmware bugs should be fixed, on the other hand, kernel should not report MCE errors either. 
So mcelog is not a good mechanism to report these events. To report the events, we count them in respective counters (core_power_limit_count, package_power_limit_count, core_throttle_count, and package_throttle_count) in /sys/devices/system/cpu/cpu#/thermal_throttle/. Users can check the counters for each event on each CPU. Please note that all CPU's on one package report duplicate counters. It's user application's responsibity to retrieve a package level counter for one package. This patch doesn't report package level power limit, core level power limit, and package level thermal throttle events in mcelog. When the events happen, only report them in respective counters in sysfs. Since core level thermal throttle has been legacy code in kernel for a while and users accepted it as MCE error in mcelog, core level thermal throttle is still reported in mcelog. In the mean time, the event is counted in a counter in sysfs as well. Signed-off-by: Fenghua Yu Acked-by: Borislav Petkov Acked-by: Tony Luck Link: http://lkml.kernel.org/r/20111215001945.GA21009@linux-os.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mcheck/therm_throt.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c..ce04b58 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -323,17 +323,6 @@ device_initcall(thermal_throttle_init_device); #endif /* CONFIG_SYSFS */ -/* - * Set up the most two significant bit to notify mce log that this thermal - * event type. - * This is a temp solution. May be changed in the future with mce log - * infrasture. - */ -#define CORE_THROTTLED (0) -#define CORE_POWER_LIMIT ((__u64)1 << 62) -#define PACKAGE_THROTTLED ((__u64)2 << 62) -#define PACKAGE_POWER_LIMIT ((__u64)3 << 62) - static void notify_thresholds(__u64 msr_val) { /* check whether the interrupt handler is defined; @@ -363,27 +352,23 @@ static void intel_thermal_interrupt(void) if (therm_throt_process(msr_val & THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_THROTTLED | msr_val); + mce_log_therm_throt_event(msr_val); if (this_cpu_has(X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, + therm_throt_process(msr_val & THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - CORE_LEVEL) != 0) - mce_log_therm_throt_event(CORE_POWER_LIMIT | msr_val); + CORE_LEVEL); if (this_cpu_has(X86_FEATURE_PTS)) { rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); - if (therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_PROCHOT, THERMAL_THROTTLING_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_THROTTLED | msr_val); + PACKAGE_LEVEL); if (this_cpu_has(X86_FEATURE_PLN)) - if (therm_throt_process(msr_val & + therm_throt_process(msr_val & PACKAGE_THERM_STATUS_POWER_LIMIT, POWER_LIMIT_EVENT, - PACKAGE_LEVEL) != 0) - mce_log_therm_throt_event(PACKAGE_POWER_LIMIT - | msr_val); + PACKAGE_LEVEL); } } -- cgit v1.1 From 969df4b82904a30fef19a67398a0c854d223ea67 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 14 Dec 2011 16:12:54 +0100 Subject: x86: Report cpb and eff_freq_ro flags correctly Add the flags to get rid of the [9] and [10] feature names in cpuinfo's 'power management' fields and replace them with meaningful names. 
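For context, the "[9]" and "[10]" placeholders come from the /proc/cpuinfo printing loop, sketched here in approximate, paraphrased form (it lives in arch/x86/kernel/cpu/proc.c and is not part of this patch): bits of c->x86_power that have an entry in x86_power_flags[] are printed by name, everything else falls back to a numeric index.

    /* Approximate sketch of the "power management" line in /proc/cpuinfo */
    for (i = 0; i < 32; i++) {
            if (!(c->x86_power & (1 << i)))
                    continue;
            if (i < ARRAY_SIZE(x86_power_flags) && x86_power_flags[i])
                    seq_printf(m, "%s%s",
                               x86_power_flags[i][0] ? " " : "",
                               x86_power_flags[i]);     /* named bit ("" hides it) */
            else
                    seq_printf(m, " [%d]", i);          /* unnamed bit: the old [9]/[10] */
    }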
Signed-off-by: Joerg Roedel Link: http://lkml.kernel.org/r/1323875574-17881-1-git-send-email-joerg.roedel@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/powerflags.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/powerflags.c b/arch/x86/kernel/cpu/powerflags.c index 5abbea2..7b3fe56 100644 --- a/arch/x86/kernel/cpu/powerflags.c +++ b/arch/x86/kernel/cpu/powerflags.c @@ -16,5 +16,6 @@ const char *const x86_power_flags[32] = { "100mhzsteps", "hwpstate", "", /* tsc invariant mapped to constant_tsc */ - /* nothing */ + "cpb", /* core performance boost */ + "eff_freq_ro", /* Readonly aperf/mperf */ }; -- cgit v1.1 From cb3f718de8301a969f8169d7d4160e73baff0b86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timo=20Ter=C3=A4s?= Date: Thu, 15 Dec 2011 17:11:28 +0200 Subject: x86, centaur: Enable cx8 for VIA Eden too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit My box with following cpuinfo needs the cx8 enabling still: vendor_id : CentaurHauls cpu family : 6 model : 13 model name : VIA Eden Processor 1200MHz stepping : 0 cpu MHz : 1199.940 cache size : 128 KB This fixes valgrind to work on my box (it requires and checks cx8 from cpuinfo). Signed-off-by: Timo Teräs Link: http://lkml.kernel.org/r/1323961888-10223-1-git-send-email-timo.teras@iki.fi Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/centaur.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e58d978..159103c 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -278,7 +278,7 @@ static void __cpuinit init_c3(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_32 /* Cyrix III family needs CX8 & PGE explicitly enabled. */ - if (c->x86_model >= 6 && c->x86_model <= 9) { + if (c->x86_model >= 6 && c->x86_model <= 13) { rdmsr(MSR_VIA_FCR, lo, hi); lo |= (1<<1 | 1<<7); wrmsr(MSR_VIA_FCR, lo, hi); -- cgit v1.1 From 2c29d9dd577b74b44e580f957ea44d1df73af23a Mon Sep 17 00:00:00 2001 From: Chen Gong Date: Wed, 7 Dec 2011 09:21:37 -0800 Subject: x86: add IRQ context simulation in module mce-inject mce-inject provides a mechanism to simulate errors so that test scripts can check for correct operation of the kernel without requiring any specialized hardware to create rare events. The existing code can simulate events in normal process context and also in NMI context - but not in IRQ context. This patch fills that gap. 
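The dispatch added here is condensed below from the hunk that follows (the flag is spelled MCJ_IRQ_BRAODCAST in the patch itself): an IRQ-context broadcast uses a non-waiting smp_call_function_many() so that the subsequent raise_local() on the injecting CPU stays in step with the IPI handlers, while an NMI-context broadcast keeps using the NMI vector.

    if (!cpumask_empty(mce_inject_cpumask)) {
            if (m->inject_flags & MCJ_IRQ_BRAODCAST) {
                    /* don't wait: mce_irq_ipi() must run alongside raise_local() */
                    preempt_disable();
                    smp_call_function_many(mce_inject_cpumask,
                                           mce_irq_ipi, NULL, 0);
                    preempt_enable();
            } else if (m->inject_flags & MCJ_NMI_BROADCAST) {
                    apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR);
            }
    }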
Link: https://lkml.org/lkml/2011/12/7/537 Signed-off-by: Chen Gong Signed-off-by: Tony Luck --- arch/x86/kernel/cpu/mcheck/mce-inject.c | 34 +++++++++++++++++++++++++++++---- 1 file changed, 30 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/mcheck/mce-inject.c b/arch/x86/kernel/cpu/mcheck/mce-inject.c index 319882e..fc4beb3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-inject.c +++ b/arch/x86/kernel/cpu/mcheck/mce-inject.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -92,6 +93,18 @@ static int mce_raise_notify(unsigned int cmd, struct pt_regs *regs) return NMI_HANDLED; } +static void mce_irq_ipi(void *info) +{ + int cpu = smp_processor_id(); + struct mce *m = &__get_cpu_var(injectm); + + if (cpumask_test_cpu(cpu, mce_inject_cpumask) && + m->inject_flags & MCJ_EXCEPTION) { + cpumask_clear_cpu(cpu, mce_inject_cpumask); + raise_exception(m, NULL); + } +} + /* Inject mce on current CPU */ static int raise_local(void) { @@ -139,9 +152,10 @@ static void raise_mce(struct mce *m) return; #ifdef CONFIG_X86_LOCAL_APIC - if (m->inject_flags & MCJ_NMI_BROADCAST) { + if (m->inject_flags & (MCJ_IRQ_BRAODCAST | MCJ_NMI_BROADCAST)) { unsigned long start; int cpu; + get_online_cpus(); cpumask_copy(mce_inject_cpumask, cpu_online_mask); cpumask_clear_cpu(get_cpu(), mce_inject_cpumask); @@ -151,13 +165,25 @@ static void raise_mce(struct mce *m) MCJ_CTX(mcpu->inject_flags) != MCJ_CTX_RANDOM) cpumask_clear_cpu(cpu, mce_inject_cpumask); } - if (!cpumask_empty(mce_inject_cpumask)) - apic->send_IPI_mask(mce_inject_cpumask, NMI_VECTOR); + if (!cpumask_empty(mce_inject_cpumask)) { + if (m->inject_flags & MCJ_IRQ_BRAODCAST) { + /* + * don't wait because mce_irq_ipi is necessary + * to be sync with following raise_local + */ + preempt_disable(); + smp_call_function_many(mce_inject_cpumask, + mce_irq_ipi, NULL, 0); + preempt_enable(); + } else if (m->inject_flags & MCJ_NMI_BROADCAST) + apic->send_IPI_mask(mce_inject_cpumask, + NMI_VECTOR); + } start = jiffies; while (!cpumask_empty(mce_inject_cpumask)) { if (!time_before(jiffies, start + 2*HZ)) { printk(KERN_ERR - "Timeout waiting for mce inject NMI %lx\n", + "Timeout waiting for mce inject %lx\n", *cpumask_bits(mce_inject_cpumask)); break; } -- cgit v1.1 From b49d7d877ff96428c8cd2076b33ba72bf85ceaba Mon Sep 17 00:00:00 2001 From: Fernando Luis Vazquez Cao Date: Thu, 15 Dec 2011 11:32:24 +0900 Subject: x86: Convert per-cpu counter icr_read_retry_count into a member of irq_stat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LAPIC related statistics are grouped inside the per-cpu structure irq_stat, so there is no need for icr_read_retry_count to be a standalone per-cpu variable. This patch moves icr_read_retry_count to where it belongs. 
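The header side of the move is not in the diffstat below; the assumed shape is that the counter simply becomes one more member of the per-cpu irq_cpustat_t (in <asm/hardirq.h>), which is what lets inc_irq_stat() and irq_stats() treat it like the other APIC statistics:

    /* Assumed addition to irq_cpustat_t; member names besides the new one are
     * illustrative, the real structure carries more statistics than shown. */
    typedef struct {
            unsigned int __softirq_pending;
            unsigned int apic_timer_irqs;           /* existing member */
            unsigned int icr_read_retry_count;      /* new: ICR read retries */
            /* ... remaining statistics unchanged ... */
    } ____cacheline_aligned irq_cpustat_t;

    /* usage in the ICR wait slow path, as in the apic.c hunk below */
    inc_irq_stat(icr_read_retry_count);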
Suggested-y: Thomas Gleixner Signed-off-by: Fernando Luis Vazquez Cao Cc: Jörn Engel Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 7 +------ arch/x86/kernel/irq.c | 4 ++-- 2 files changed, 3 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2942794..0783236 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -79,11 +79,6 @@ DEFINE_EARLY_PER_CPU(u16, x86_bios_cpu_apicid, BAD_APICID); EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_apicid); EXPORT_EARLY_PER_CPU_SYMBOL(x86_bios_cpu_apicid); -/* - * ICR read retry counter - */ -DEFINE_PER_CPU(unsigned, icr_read_retry_count); - #ifdef CONFIG_X86_32 /* @@ -255,7 +250,7 @@ u32 native_safe_apic_wait_icr_idle(void) send_status = apic_read(APIC_ICR) & APIC_ICR_BUSY; if (!send_status) break; - percpu_inc(icr_read_retry_count); + inc_irq_stat(icr_read_retry_count); udelay(100); } while (timeout++ < 1000); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 4bbf162..ef54ed4 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -76,7 +76,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, " IRQ work interrupts\n"); seq_printf(p, "%*s: ", prec, "RTR"); for_each_online_cpu(j) - seq_printf(p, "%10u ", per_cpu(icr_read_retry_count, j)); + seq_printf(p, "%10u ", irq_stats(j)->icr_read_retry_count); seq_printf(p, " APIC ICR read retries\n"); #endif if (x86_platform_ipi_callback) { @@ -140,7 +140,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; sum += irq_stats(cpu)->apic_irq_work_irqs; - sum += per_cpu(icr_read_retry_count, cpu); + sum += irq_stats(cpu)->icr_read_retry_count; #endif if (x86_platform_ipi_callback) sum += irq_stats(cpu)->x86_platform_ipis; -- cgit v1.1 From 13f541c10b30fc6529200d7f9a0073217709622f Mon Sep 17 00:00:00 2001 From: Clemens Ladisch Date: Mon, 19 Dec 2011 22:07:58 +0100 Subject: x86, dumpstack: Fix code bytes breakage due to missing KERN_CONT When printing the code bytes in show_registers(), the markers around the byte at the fault address could make the printk() format string look like a valid log level and facility code. This would prevent this byte from being printed and result in a spurious newline: [ 7555.765589] Code: 8b 32 e9 94 00 00 00 81 7d 00 ff 00 00 00 0f 87 96 00 00 00 48 8b 83 c0 00 00 00 44 89 e2 44 89 e6 48 89 df 48 8b 80 d8 02 00 00 [ 7555.765683] 8b 48 28 48 89 d0 81 e2 ff 0f 00 00 48 c1 e8 0c 48 c1 e0 04 Add KERN_CONT where needed, and elsewhere in show_registers() for consistency. Signed-off-by: Clemens Ladisch Link: http://lkml.kernel.org/r/4EEFA7AE.9020407@ladisch.de Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/dumpstack_32.c | 8 ++++---- arch/x86/kernel/dumpstack_64.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 3b97a80..c99f9ed 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -116,16 +116,16 @@ void show_registers(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(" Bad EIP value."); + printk(KERN_CONT " Bad EIP value."); break; } if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); + printk(KERN_CONT "<%02x> ", c); else - printk("%02x ", c); + printk(KERN_CONT "%02x ", c); } } - printk("\n"); + printk(KERN_CONT "\n"); } int is_valid_bugaddr(unsigned long ip) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index 19853ad..6d728d9 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -284,16 +284,16 @@ void show_registers(struct pt_regs *regs) for (i = 0; i < code_len; i++, ip++) { if (ip < (u8 *)PAGE_OFFSET || probe_kernel_address(ip, c)) { - printk(" Bad RIP value."); + printk(KERN_CONT " Bad RIP value."); break; } if (ip == (u8 *)regs->ip) - printk("<%02x> ", c); + printk(KERN_CONT "<%02x> ", c); else - printk("%02x ", c); + printk(KERN_CONT "%02x ", c); } } - printk("\n"); + printk(KERN_CONT "\n"); } int is_valid_bugaddr(unsigned long ip) -- cgit v1.1 From 141168c36cdee3ff23d9c7700b0edc47cb65479f Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Tue, 20 Dec 2011 20:52:22 -0400 Subject: x86: Simplify code by removing a !SMP #ifdefs from 'struct cpuinfo_x86' Several fields in struct cpuinfo_x86 were not defined for the !SMP case, likely to save space. However, those fields still have some meaning for UP, and keeping them allows some #ifdef removal from other files. The additional size of the UP kernel from this change is not significant enough to worry about keeping up the distinction: text data bss dec hex filename 4737168 506459 972040 6215667 5ed7f3 vmlinux.o.before 4737444 506459 972040 6215943 5ed907 vmlinux.o.after for a difference of 276 bytes for an example UP config. If someone wants those 276 bytes back badly then it should be implemented in a cleaner way. 
Signed-off-by: Kevin Winchester Cc: Steffen Persvold Link: http://lkml.kernel.org/r/1324428742-12498-1-git-send-email-kjwinchester@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/amd_nb.c | 8 ++------ arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/cpu/common.c | 7 ------- arch/x86/kernel/cpu/intel.c | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 2 -- arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 +------ arch/x86/kernel/cpu/proc.c | 4 +--- 7 files changed, 4 insertions(+), 28 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4c39baa..013c181 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -123,16 +123,14 @@ int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; unsigned int mask; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return 0; pci_read_config_dword(link, 0x1d4, &mask); -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif return (mask >> (4 * cuid)) & 0xf; } @@ -141,7 +139,7 @@ int amd_set_subcaches(int cpu, int mask) static unsigned int reset, ban; struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); unsigned int reg; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) return -EINVAL; @@ -159,9 +157,7 @@ int amd_set_subcaches(int cpu, int mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ef21bdc..f4773f4 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) valid_k7: ; -#endif } static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a70bd5b..850f296 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_early_init) this_cpu->c_early_init(c); -#ifdef CONFIG_SMP c->cpu_index = 0; -#endif filter_cpuid_features(c, false); setup_smep(c); @@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->apicid = c->initial_apicid; # endif #endif - -#ifdef CONFIG_X86_HT c->phys_proc_id = c->initial_apicid; -#endif } setup_smep(c); @@ -1146,9 +1141,7 @@ static void dbg_restore_debug_regs(void) */ void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) { -#ifdef CONFIG_NUMA pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); -#endif } /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 5231312..3e6ff6c 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void) static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? 
*/ if (!c->cpu_index) return; @@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: SMP operation may be unreliable" "with B stepping processors.\n"); } -#endif } static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d..e9c9d0a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -119,9 +119,7 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f547421..1d76872 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -64,11 +64,9 @@ struct threshold_bank { }; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); -#ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 }; -#endif static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -202,10 +200,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP if (shared_bank[bank] && c->cpu_core_id) break; -#endif + offset = setup_APIC_mce(offset, (high & MASK_LVTOFF_HI) >> 20); @@ -531,7 +528,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); -#ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -558,7 +554,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 14b2314..8022c66 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - unsigned int cpu = 0; + unsigned int cpu; int i; -#ifdef CONFIG_SMP cpu = c->cpu_index; -#endif seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" -- cgit v1.1 From cd09c0c40a971549800ce6a7e53c63f5139dd175 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Sun, 11 Dec 2011 00:28:51 +0100 Subject: perf events: Enable raw event support for Intel unhalted_reference_cycles event This patch adds the encoding and definitions necessary for the unhalted_reference_cycles event available since Intel Core 2 processors.
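Together with the PERF_COUNT_HW_REF_CPU_CYCLES mapping added by a companion patch further down, this lets user space count reference cycles through the generic hardware event. A minimal user-space sketch (not part of the patch; error handling omitted for brevity, and it requires a kernel carrying these changes):

    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/syscall.h>
    #include <linux/perf_event.h>

    int main(void)
    {
    	struct perf_event_attr attr;
    	uint64_t count;
    	int fd;

    	memset(&attr, 0, sizeof(attr));
    	attr.size = sizeof(attr);
    	attr.type = PERF_TYPE_HARDWARE;
    	attr.config = PERF_COUNT_HW_REF_CPU_CYCLES;	/* backed by fixed counter 2 / the 0x0300 pseudo-encoding */
    	attr.disabled = 1;
    	attr.exclude_kernel = 1;

    	fd = syscall(__NR_perf_event_open, &attr, 0, -1, -1, 0);	/* current task, any cpu */
    	if (fd < 0)
    		return 1;

    	ioctl(fd, PERF_EVENT_IOC_ENABLE, 0);
    	/* ... workload being measured ... */
    	ioctl(fd, PERF_EVENT_IOC_DISABLE, 0);

    	if (read(fd, &count, sizeof(count)) == sizeof(count))
    		printf("ref-cycles: %llu\n", (unsigned long long)count);
    	close(fd);
    	return 0;
    }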
Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1323559734-3488-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 8 +++++++- arch/x86/kernel/cpu/perf_event_intel.c | 15 +++++---------- 2 files changed, 12 insertions(+), 11 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 930fe48..5adce10 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1304,9 +1304,15 @@ static int __init init_hw_perf_events(void) 0, x86_pmu.num_counters, 0); if (x86_pmu.event_constraints) { + /* + * event on fixed counter2 (REF_CYCLES) only works on this + * counter, so do not extend mask to generic counters + */ for_each_event_constraint(c, x86_pmu.event_constraints) { - if (c->cmask != X86_RAW_EVENT_MASK) + if (c->cmask != X86_RAW_EVENT_MASK + || c->idxmsk64 == X86_PMC_MSK_FIXED_REF_CYCLES) { continue; + } c->idxmsk64 |= (1ULL << x86_pmu.num_counters) - 1; c->weight += x86_pmu.num_counters; diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2c3bf53..61f865f 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -45,12 +45,7 @@ static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* - * Core2 has Fixed Counter 2 listed as CPU_CLK_UNHALTED.REF and event - * 0x013c as CPU_CLK_UNHALTED.BUS and specifies there is a fixed - * ratio between these counters. - */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x10, 0x1), /* FP_COMP_OPS_EXE */ INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -68,7 +63,7 @@ static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x40, 0x3), /* L1D_CACHE_LD */ INTEL_EVENT_CONSTRAINT(0x41, 0x3), /* L1D_CACHE_ST */ INTEL_EVENT_CONSTRAINT(0x42, 0x3), /* L1D_CACHE_LOCK */ @@ -90,7 +85,7 @@ static struct event_constraint intel_westmere_event_constraints[] __read_mostly { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x51, 0x3), /* L1D */ INTEL_EVENT_CONSTRAINT(0x60, 0x1), /* OFFCORE_REQUESTS_OUTSTANDING */ INTEL_EVENT_CONSTRAINT(0x63, 0x3), /* CACHE_LOCK_CYCLES */ @@ -102,7 +97,7 @@ static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ INTEL_EVENT_CONSTRAINT(0x48, 0x4), /* L1D_PEND_MISS.PENDING */ INTEL_UEVENT_CONSTRAINT(0x01c0, 0x2), /* INST_RETIRED.PREC_DIST */ INTEL_EVENT_CONSTRAINT(0xcd, 0x8), /* 
MEM_TRANS_RETIRED.LOAD_LATENCY */ @@ -125,7 +120,7 @@ static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ - /* FIXED_EVENT_CONSTRAINT(0x013c, 2), CPU_CLK_UNHALTED.REF */ + FIXED_EVENT_CONSTRAINT(0x0300, 2), /* CPU_CLK_UNHALTED.REF */ EVENT_CONSTRAINT_END }; -- cgit v1.1 From 9c1497ea591b25d491f8e795f90a1405100b75ef Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Sun, 11 Dec 2011 00:28:53 +0100 Subject: perf events: Add Intel x86 mapping for PERF_COUNT_HW_REF_CPU_CYCLES Add event maps for Intel x86 processors (with architected PMU v2 or later). On AMD, there is frequency scaling but no Turbo. There is no core cycle event not subject to frequency scaling, therefore we do not provide a mapping. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1323559734-3488-4-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 61f865f..cbfaaa2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -28,6 +28,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c5, [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, + [PERF_COUNT_HW_REF_CPU_CYCLES] = 0x0300, /* pseudo-encoding */ }; static struct event_constraint intel_core_event_constraints[] __read_mostly = -- cgit v1.1 From 8a25a2fd126c621f44f3aeaef80d51f00fc11639 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 14:29:42 -0800 Subject: cpu: convert 'cpu' and 'machinecheck' sysdev_class to a regular subsystem This moves the 'cpu sysdev_class' over to a regular 'cpu' subsystem and converts the devices to regular devices. The sysdev drivers are implemented as subsystem interfaces now. After all sysdev classes are ported to regular driver core entities, the sysdev implementation will be entirely removed from the kernel. Userspace relies on events and generic sysfs subsystem infrastructure from sysdev devices, which are made available with this conversion. Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: Paul Mundt Cc: "David S. Miller" Cc: Chris Metcalf Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Borislav Petkov Cc: Tigran Aivazian Cc: Len Brown Cc: Zhang Rui Cc: Dave Jones Cc: Peter Zijlstra Cc: Russell King Cc: Andrew Morton Cc: Arjan van de Ven Cc: "Rafael J. Wysocki" Cc: "Srivatsa S. 
Bhat" Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/intel_cacheinfo.c | 25 +++--- arch/x86/kernel/cpu/mcheck/mce-internal.h | 4 +- arch/x86/kernel/cpu/mcheck/mce.c | 128 +++++++++++++++--------------- arch/x86/kernel/cpu/mcheck/mce_amd.c | 11 ++- arch/x86/kernel/cpu/mcheck/therm_throt.c | 63 ++++++++------- arch/x86/kernel/microcode_core.c | 58 +++++++------- 6 files changed, 144 insertions(+), 145 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index a3b0811..6b45e5e 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -844,8 +844,7 @@ static int __cpuinit detect_cache_attributes(unsigned int cpu) #include #include - -extern struct sysdev_class cpu_sysdev_class; /* from drivers/base/cpu.c */ +#include /* pointer to kobject for cpuX/cache */ static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); @@ -1073,9 +1072,9 @@ err_out: static DECLARE_BITMAP(cache_dev_map, NR_CPUS); /* Add/Remove cache interface for CPU device */ -static int __cpuinit cache_add_dev(struct sys_device * sys_dev) +static int __cpuinit cache_add_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i, j; struct _index_kobject *this_object; struct _cpuid4_info *this_leaf; @@ -1087,7 +1086,7 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), &ktype_percpu_entry, - &sys_dev->kobj, "%s", "cache"); + &dev->kobj, "%s", "cache"); if (retval < 0) { cpuid4_cache_sysfs_exit(cpu); return retval; @@ -1124,9 +1123,9 @@ static int __cpuinit cache_add_dev(struct sys_device * sys_dev) return 0; } -static void __cpuinit cache_remove_dev(struct sys_device * sys_dev) +static void __cpuinit cache_remove_dev(struct device *dev) { - unsigned int cpu = sys_dev->id; + unsigned int cpu = dev->id; unsigned long i; if (per_cpu(ici_cpuid4_info, cpu) == NULL) @@ -1145,17 +1144,17 @@ static int __cpuinit cacheinfo_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - cache_add_dev(sys_dev); + cache_add_dev(dev); break; case CPU_DEAD: case CPU_DEAD_FROZEN: - cache_remove_dev(sys_dev); + cache_remove_dev(dev); break; } return NOTIFY_OK; @@ -1174,9 +1173,9 @@ static int __cpuinit cache_sysfs_init(void) for_each_online_cpu(i) { int err; - struct sys_device *sys_dev = get_cpu_sysdev(i); + struct device *dev = get_cpu_device(i); - err = cache_add_dev(sys_dev); + err = cache_add_dev(dev); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index fefcc69..ed44c8a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -1,4 +1,4 @@ -#include +#include #include enum severity_level { @@ -17,7 +17,7 @@ enum severity_level { struct mce_bank { u64 ctl; /* subevents to enable */ unsigned char init; /* initialise bank? 
*/ - struct sysdev_attribute attr; /* sysdev attribute */ + struct device_attribute attr; /* device attribute */ char attrname[ATTR_LEN]; /* attribute name */ }; diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 362056a..0156c6f 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -19,7 +19,7 @@ #include #include #include -#include +#include #include #include #include @@ -1751,7 +1751,7 @@ static struct syscore_ops mce_syscore_ops = { }; /* - * mce_sysdev: Sysfs support + * mce_device: Sysfs support */ static void mce_cpu_restart(void *data) @@ -1787,27 +1787,28 @@ static void mce_enable_ce(void *all) __mcheck_cpu_init_timer(); } -static struct sysdev_class mce_sysdev_class = { +static struct bus_type mce_subsys = { .name = "machinecheck", + .dev_name = "machinecheck", }; -DEFINE_PER_CPU(struct sys_device, mce_sysdev); +DEFINE_PER_CPU(struct device, mce_device); __cpuinitdata void (*threshold_cpu_callback)(unsigned long action, unsigned int cpu); -static inline struct mce_bank *attr_to_bank(struct sysdev_attribute *attr) +static inline struct mce_bank *attr_to_bank(struct device_attribute *attr) { return container_of(attr, struct mce_bank, attr); } -static ssize_t show_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t show_bank(struct device *s, struct device_attribute *attr, char *buf) { return sprintf(buf, "%llx\n", attr_to_bank(attr)->ctl); } -static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_bank(struct device *s, struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1822,14 +1823,14 @@ static ssize_t set_bank(struct sys_device *s, struct sysdev_attribute *attr, } static ssize_t -show_trigger(struct sys_device *s, struct sysdev_attribute *attr, char *buf) +show_trigger(struct device *s, struct device_attribute *attr, char *buf) { strcpy(buf, mce_helper); strcat(buf, "\n"); return strlen(mce_helper) + 1; } -static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, +static ssize_t set_trigger(struct device *s, struct device_attribute *attr, const char *buf, size_t siz) { char *p; @@ -1844,8 +1845,8 @@ static ssize_t set_trigger(struct sys_device *s, struct sysdev_attribute *attr, return strlen(mce_helper) + !!p; } -static ssize_t set_ignore_ce(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_ignore_ce(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1868,8 +1869,8 @@ static ssize_t set_ignore_ce(struct sys_device *s, return size; } -static ssize_t set_cmci_disabled(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t set_cmci_disabled(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { u64 new; @@ -1891,108 +1892,107 @@ static ssize_t set_cmci_disabled(struct sys_device *s, return size; } -static ssize_t store_int_with_restart(struct sys_device *s, - struct sysdev_attribute *attr, +static ssize_t store_int_with_restart(struct device *s, + struct device_attribute *attr, const char *buf, size_t size) { - ssize_t ret = sysdev_store_int(s, attr, buf, size); + ssize_t ret = device_store_int(s, attr, buf, size); mce_restart(); return ret; } -static SYSDEV_ATTR(trigger, 0644, show_trigger, set_trigger); -static SYSDEV_INT_ATTR(tolerant, 0644, tolerant); -static SYSDEV_INT_ATTR(monarch_timeout, 0644, monarch_timeout); -static SYSDEV_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); +static 
DEVICE_ATTR(trigger, 0644, show_trigger, set_trigger); +static DEVICE_INT_ATTR(tolerant, 0644, tolerant); +static DEVICE_INT_ATTR(monarch_timeout, 0644, monarch_timeout); +static DEVICE_INT_ATTR(dont_log_ce, 0644, mce_dont_log_ce); -static struct sysdev_ext_attribute attr_check_interval = { - _SYSDEV_ATTR(check_interval, 0644, sysdev_show_int, - store_int_with_restart), +static struct dev_ext_attribute dev_attr_check_interval = { + __ATTR(check_interval, 0644, device_show_int, store_int_with_restart), &check_interval }; -static struct sysdev_ext_attribute attr_ignore_ce = { - _SYSDEV_ATTR(ignore_ce, 0644, sysdev_show_int, set_ignore_ce), +static struct dev_ext_attribute dev_attr_ignore_ce = { + __ATTR(ignore_ce, 0644, device_show_int, set_ignore_ce), &mce_ignore_ce }; -static struct sysdev_ext_attribute attr_cmci_disabled = { - _SYSDEV_ATTR(cmci_disabled, 0644, sysdev_show_int, set_cmci_disabled), +static struct dev_ext_attribute dev_attr_cmci_disabled = { + __ATTR(cmci_disabled, 0644, device_show_int, set_cmci_disabled), &mce_cmci_disabled }; -static struct sysdev_attribute *mce_sysdev_attrs[] = { - &attr_tolerant.attr, - &attr_check_interval.attr, - &attr_trigger, - &attr_monarch_timeout.attr, - &attr_dont_log_ce.attr, - &attr_ignore_ce.attr, - &attr_cmci_disabled.attr, +static struct device_attribute *mce_device_attrs[] = { + &dev_attr_tolerant.attr, + &dev_attr_check_interval.attr, + &dev_attr_trigger, + &dev_attr_monarch_timeout.attr, + &dev_attr_dont_log_ce.attr, + &dev_attr_ignore_ce.attr, + &dev_attr_cmci_disabled.attr, NULL }; -static cpumask_var_t mce_sysdev_initialized; +static cpumask_var_t mce_device_initialized; -/* Per cpu sysdev init. All of the cpus still share the same ctrl bank: */ -static __cpuinit int mce_sysdev_create(unsigned int cpu) +/* Per cpu device init. 
All of the cpus still share the same ctrl bank: */ +static __cpuinit int mce_device_create(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = &per_cpu(mce_device, cpu); int err; int i, j; if (!mce_available(&boot_cpu_data)) return -EIO; - memset(&sysdev->kobj, 0, sizeof(struct kobject)); - sysdev->id = cpu; - sysdev->cls = &mce_sysdev_class; + memset(&dev->kobj, 0, sizeof(struct kobject)); + dev->id = cpu; + dev->bus = &mce_subsys; - err = sysdev_register(sysdev); + err = device_register(dev); if (err) return err; - for (i = 0; mce_sysdev_attrs[i]; i++) { - err = sysdev_create_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) { + err = device_create_file(dev, mce_device_attrs[i]); if (err) goto error; } for (j = 0; j < banks; j++) { - err = sysdev_create_file(sysdev, &mce_banks[j].attr); + err = device_create_file(dev, &mce_banks[j].attr); if (err) goto error2; } - cpumask_set_cpu(cpu, mce_sysdev_initialized); + cpumask_set_cpu(cpu, mce_device_initialized); return 0; error2: while (--j >= 0) - sysdev_remove_file(sysdev, &mce_banks[j].attr); + device_remove_file(dev, &mce_banks[j].attr); error: while (--i >= 0) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + device_remove_file(dev, mce_device_attrs[i]); - sysdev_unregister(sysdev); + device_unregister(dev); return err; } -static __cpuinit void mce_sysdev_remove(unsigned int cpu) +static __cpuinit void mce_device_remove(unsigned int cpu) { - struct sys_device *sysdev = &per_cpu(mce_sysdev, cpu); + struct device *dev = &per_cpu(mce_device, cpu); int i; - if (!cpumask_test_cpu(cpu, mce_sysdev_initialized)) + if (!cpumask_test_cpu(cpu, mce_device_initialized)) return; - for (i = 0; mce_sysdev_attrs[i]; i++) - sysdev_remove_file(sysdev, mce_sysdev_attrs[i]); + for (i = 0; mce_device_attrs[i]; i++) + device_remove_file(dev, mce_device_attrs[i]); for (i = 0; i < banks; i++) - sysdev_remove_file(sysdev, &mce_banks[i].attr); + device_remove_file(dev, &mce_banks[i].attr); - sysdev_unregister(sysdev); - cpumask_clear_cpu(cpu, mce_sysdev_initialized); + device_unregister(dev); + cpumask_clear_cpu(cpu, mce_device_initialized); } /* Make sure there are no machine checks on offlined CPUs. 
*/ @@ -2042,7 +2042,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - mce_sysdev_create(cpu); + mce_device_create(cpu); if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); break; @@ -2050,7 +2050,7 @@ mce_cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu) case CPU_DEAD_FROZEN: if (threshold_cpu_callback) threshold_cpu_callback(action, cpu); - mce_sysdev_remove(cpu); + mce_device_remove(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -2084,7 +2084,7 @@ static __init void mce_init_banks(void) for (i = 0; i < banks; i++) { struct mce_bank *b = &mce_banks[i]; - struct sysdev_attribute *a = &b->attr; + struct device_attribute *a = &b->attr; sysfs_attr_init(&a->attr); a->attr.name = b->attrname; @@ -2104,16 +2104,16 @@ static __init int mcheck_init_device(void) if (!mce_available(&boot_cpu_data)) return -EIO; - zalloc_cpumask_var(&mce_sysdev_initialized, GFP_KERNEL); + zalloc_cpumask_var(&mce_device_initialized, GFP_KERNEL); mce_init_banks(); - err = sysdev_class_register(&mce_sysdev_class); + err = subsys_system_register(&mce_subsys, NULL); if (err) return err; for_each_online_cpu(i) { - err = mce_sysdev_create(i); + err = mce_device_create(i); if (err) return err; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f547421..56d2aa1 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include @@ -548,7 +547,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (!b) goto out; - err = sysfs_create_link(&per_cpu(mce_sysdev, cpu).kobj, + err = sysfs_create_link(&per_cpu(mce_device, cpu).kobj, b->kobj, name); if (err) goto out; @@ -571,7 +570,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } - b->kobj = kobject_create_and_add(name, &per_cpu(mce_sysdev, cpu).kobj); + b->kobj = kobject_create_and_add(name, &per_cpu(mce_device, cpu).kobj); if (!b->kobj) goto out_free; @@ -591,7 +590,7 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) if (i == cpu) continue; - err = sysfs_create_link(&per_cpu(mce_sysdev, i).kobj, + err = sysfs_create_link(&per_cpu(mce_device, i).kobj, b->kobj, name); if (err) goto out; @@ -669,7 +668,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) #ifdef CONFIG_SMP /* sibling symlink */ if (shared_bank[bank] && b->blocks->cpu != cpu) { - sysfs_remove_link(&per_cpu(mce_sysdev, cpu).kobj, name); + sysfs_remove_link(&per_cpu(mce_device, cpu).kobj, name); per_cpu(threshold_banks, cpu)[bank] = NULL; return; @@ -681,7 +680,7 @@ static void threshold_remove_bank(unsigned int cpu, int bank) if (i == cpu) continue; - sysfs_remove_link(&per_cpu(mce_sysdev, i).kobj, name); + sysfs_remove_link(&per_cpu(mce_device, i).kobj, name); per_cpu(threshold_banks, i)[bank] = NULL; } diff --git a/arch/x86/kernel/cpu/mcheck/therm_throt.c b/arch/x86/kernel/cpu/mcheck/therm_throt.c index 787e06c..59e3f6e 100644 --- a/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ b/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -69,16 +68,16 @@ static atomic_t therm_throt_en = ATOMIC_INIT(0); static u32 lvtthmr_init __read_mostly; #ifdef CONFIG_SYSFS -#define define_therm_throt_sysdev_one_ro(_name) \ - static 
SYSDEV_ATTR(_name, 0444, \ - therm_throt_sysdev_show_##_name, \ +#define define_therm_throt_device_one_ro(_name) \ + static DEVICE_ATTR(_name, 0444, \ + therm_throt_device_show_##_name, \ NULL) \ -#define define_therm_throt_sysdev_show_func(event, name) \ +#define define_therm_throt_device_show_func(event, name) \ \ -static ssize_t therm_throt_sysdev_show_##event##_##name( \ - struct sys_device *dev, \ - struct sysdev_attribute *attr, \ +static ssize_t therm_throt_device_show_##event##_##name( \ + struct device *dev, \ + struct device_attribute *attr, \ char *buf) \ { \ unsigned int cpu = dev->id; \ @@ -95,20 +94,20 @@ static ssize_t therm_throt_sysdev_show_##event##_##name( \ return ret; \ } -define_therm_throt_sysdev_show_func(core_throttle, count); -define_therm_throt_sysdev_one_ro(core_throttle_count); +define_therm_throt_device_show_func(core_throttle, count); +define_therm_throt_device_one_ro(core_throttle_count); -define_therm_throt_sysdev_show_func(core_power_limit, count); -define_therm_throt_sysdev_one_ro(core_power_limit_count); +define_therm_throt_device_show_func(core_power_limit, count); +define_therm_throt_device_one_ro(core_power_limit_count); -define_therm_throt_sysdev_show_func(package_throttle, count); -define_therm_throt_sysdev_one_ro(package_throttle_count); +define_therm_throt_device_show_func(package_throttle, count); +define_therm_throt_device_one_ro(package_throttle_count); -define_therm_throt_sysdev_show_func(package_power_limit, count); -define_therm_throt_sysdev_one_ro(package_power_limit_count); +define_therm_throt_device_show_func(package_power_limit, count); +define_therm_throt_device_one_ro(package_power_limit_count); static struct attribute *thermal_throttle_attrs[] = { - &attr_core_throttle_count.attr, + &dev_attr_core_throttle_count.attr, NULL }; @@ -223,36 +222,36 @@ static int thresh_event_valid(int event) #ifdef CONFIG_SYSFS /* Add/Remove thermal_throttle interface for CPU device: */ -static __cpuinit int thermal_throttle_add_dev(struct sys_device *sys_dev, +static __cpuinit int thermal_throttle_add_dev(struct device *dev, unsigned int cpu) { int err; struct cpuinfo_x86 *c = &cpu_data(cpu); - err = sysfs_create_group(&sys_dev->kobj, &thermal_attr_group); + err = sysfs_create_group(&dev->kobj, &thermal_attr_group); if (err) return err; if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_core_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_core_power_limit_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PTS)) { - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_throttle_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_throttle_count.attr, thermal_attr_group.name); if (cpu_has(c, X86_FEATURE_PLN)) - err = sysfs_add_file_to_group(&sys_dev->kobj, - &attr_package_power_limit_count.attr, + err = sysfs_add_file_to_group(&dev->kobj, + &dev_attr_package_power_limit_count.attr, thermal_attr_group.name); } return err; } -static __cpuinit void thermal_throttle_remove_dev(struct sys_device *sys_dev) +static __cpuinit void thermal_throttle_remove_dev(struct device *dev) { - sysfs_remove_group(&sys_dev->kobj, &thermal_attr_group); + sysfs_remove_group(&dev->kobj, &thermal_attr_group); } /* Mutex protecting device creation against CPU hotplug: */ @@ -265,16 +264,16 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; int err = 0; - 
sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: mutex_lock(&therm_cpu_lock); - err = thermal_throttle_add_dev(sys_dev, cpu); + err = thermal_throttle_add_dev(dev, cpu); mutex_unlock(&therm_cpu_lock); WARN_ON(err); break; @@ -283,7 +282,7 @@ thermal_throttle_cpu_callback(struct notifier_block *nfb, case CPU_DEAD: case CPU_DEAD_FROZEN: mutex_lock(&therm_cpu_lock); - thermal_throttle_remove_dev(sys_dev); + thermal_throttle_remove_dev(dev); mutex_unlock(&therm_cpu_lock); break; } @@ -310,7 +309,7 @@ static __init int thermal_throttle_init_device(void) #endif /* connect live CPUs to sysfs */ for_each_online_cpu(cpu) { - err = thermal_throttle_add_dev(get_cpu_sysdev(cpu), cpu); + err = thermal_throttle_add_dev(get_cpu_device(cpu), cpu); WARN_ON(err); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index f2d2a66..cf88f2a 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -292,8 +292,8 @@ static int reload_for_cpu(int cpu) return err; } -static ssize_t reload_store(struct sys_device *dev, - struct sysdev_attribute *attr, +static ssize_t reload_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t size) { unsigned long val; @@ -318,30 +318,30 @@ static ssize_t reload_store(struct sys_device *dev, return ret; } -static ssize_t version_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t version_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->cpu_sig.rev); } -static ssize_t pf_show(struct sys_device *dev, - struct sysdev_attribute *attr, char *buf) +static ssize_t pf_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct ucode_cpu_info *uci = ucode_cpu_info + dev->id; return sprintf(buf, "0x%x\n", uci->cpu_sig.pf); } -static SYSDEV_ATTR(reload, 0200, NULL, reload_store); -static SYSDEV_ATTR(version, 0400, version_show, NULL); -static SYSDEV_ATTR(processor_flags, 0400, pf_show, NULL); +static DEVICE_ATTR(reload, 0200, NULL, reload_store); +static DEVICE_ATTR(version, 0400, version_show, NULL); +static DEVICE_ATTR(processor_flags, 0400, pf_show, NULL); static struct attribute *mc_default_attrs[] = { - &attr_reload.attr, - &attr_version.attr, - &attr_processor_flags.attr, + &dev_attr_reload.attr, + &dev_attr_version.attr, + &dev_attr_processor_flags.attr, NULL }; @@ -405,43 +405,45 @@ static enum ucode_state microcode_update_cpu(int cpu) return ustate; } -static int mc_sysdev_add(struct sys_device *sys_dev) +static int mc_device_add(struct device *dev, struct subsys_interface *sif) { - int err, cpu = sys_dev->id; + int err, cpu = dev->id; if (!cpu_online(cpu)) return 0; pr_debug("CPU%d added\n", cpu); - err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); + err = sysfs_create_group(&dev->kobj, &mc_attr_group); if (err) return err; if (microcode_init_cpu(cpu) == UCODE_ERROR) { - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); return -EINVAL; } return err; } -static int mc_sysdev_remove(struct sys_device *sys_dev) +static int mc_device_remove(struct device *dev, struct subsys_interface *sif) { - int cpu = sys_dev->id; + int cpu = dev->id; if (!cpu_online(cpu)) return 0; pr_debug("CPU%d removed\n", cpu); microcode_fini_cpu(cpu); - sysfs_remove_group(&sys_dev->kobj, 
&mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); return 0; } -static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, +static struct subsys_interface mc_cpu_interface = { + .name = "microcode", + .subsys = &cpu_subsys, + .add_dev = mc_device_add, + .remove_dev = mc_device_remove, }; /** @@ -464,9 +466,9 @@ static __cpuinit int mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; - struct sys_device *sys_dev; + struct device *dev; - sys_dev = get_cpu_sysdev(cpu); + dev = get_cpu_device(cpu); switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: @@ -474,13 +476,13 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("CPU%d added\n", cpu); - if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) + if (sysfs_create_group(&dev->kobj, &mc_attr_group)) pr_err("Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: /* Suspend is in progress, only remove the interface */ - sysfs_remove_group(&sys_dev->kobj, &mc_attr_group); + sysfs_remove_group(&dev->kobj, &mc_attr_group); pr_debug("CPU%d removed\n", cpu); break; @@ -527,7 +529,7 @@ static int __init microcode_init(void) get_online_cpus(); mutex_lock(µcode_mutex); - error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + error = subsys_interface_register(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); @@ -561,7 +563,7 @@ static void __exit microcode_exit(void) get_online_cpus(); mutex_lock(µcode_mutex); - sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + subsys_interface_unregister(&mc_cpu_interface); mutex_unlock(µcode_mutex); put_online_cpus(); -- cgit v1.1 From edbaa603eb801655e80808a9cf3d3b622e8ac66b Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 21 Dec 2011 16:26:03 -0800 Subject: driver-core: remove sysdev.h usage. The sysdev.h file should not be needed by any in-kernel code, so remove the .h file from these random files that seem to still want to include it. The sysdev code will be going away soon, so this include needs to be removed no matter what. Cc: Jiandong Zheng Cc: Scott Branden Cc: Russell King Cc: Kukjin Kim Cc: David Brown Cc: Daniel Walker Cc: Bryan Huntsman Cc: Ben Dooks Cc: Wan ZongShun Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Guan Xuetao Cc: "Venkatesh Pallipadi Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Grant Likely Cc: Richard Purdie Cc: Matthew Garrett Signed-off-by: Kay Sievers --- arch/x86/kernel/hpet.c | 1 - arch/x86/kernel/irqinit.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index b946a9e..98094a9 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index b3300e6..313fb5c 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include #include #include -- cgit v1.1 From 2e64694de21a812d637dcbea4471ad1f7897b049 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Fri, 23 Dec 2011 14:24:25 +0100 Subject: perf/x86: Fix raw_spin_unlock_irqrestore() usage Use raw_spin_unlock_irqrestore() as equivalent to raw_spin_lock_irqsave(). 
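For readers skimming the one-line fix below: a lock taken with raw_spin_lock_irqsave() must be released with raw_spin_unlock_irqrestore() so that the saved interrupt flags are put back. Illustrative sketch only, with names borrowed from the patch:

    unsigned long flags;

    raw_spin_lock_irqsave(&era->lock, flags);
    /* ... critical section ... */
    raw_spin_unlock_irqrestore(&era->lock, flags);	/* not a bare raw_spin_unlock() */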
Signed-off-by: Robert Richter Cc: Stephane Eranian Cc: Peter Zijlstra Link: http://lkml.kernel.org/r/1324646665-13334-1-git-send-email-robert.richter@amd.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 8d601b1..121f1be 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1169,7 +1169,7 @@ again: */ c = &unconstrained; } else if (intel_try_alt_er(event, orig_idx)) { - raw_spin_unlock(&era->lock); + raw_spin_unlock_irqrestore(&era->lock, flags); goto again; } raw_spin_unlock_irqrestore(&era->lock, flags); -- cgit v1.1 From e8524b2f43ab6747518aef81c709d104c478b1cd Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 21 Dec 2011 17:45:15 -0800 Subject: x86, apic: Add probe() for apic_flat Currently we start with the default apic_flat mode and switch to some other apic model depending on the apic drivers acpi_madt_oem_check() routines and later followed by the apic drivers probe() routines. Once we selected non flat mode there was no case where we fall back to flat mode again. Upcoming changes allow bios-enabled x2apic mode to be disabled by the OS if interrupt-remapping etc is not setup properly by the bios. We now have a case for the apic to fall back to legacy flat mode during the apic driver probe() sequence. Add a simple flat_probe() which allows the apic_flat mode to be the last fallback option. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/20111222014632.484984298@sbsiddha-desk.sc.intel.com Signed-off-by: Suresh Siddha Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic_flat_64.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic_flat_64.c b/arch/x86/kernel/apic/apic_flat_64.c index 57c1f41..8c3cdde 100644 --- a/arch/x86/kernel/apic/apic_flat_64.c +++ b/arch/x86/kernel/apic/apic_flat_64.c @@ -171,9 +171,14 @@ static int flat_phys_pkg_id(int initial_apic_id, int index_msb) return initial_apic_id >> index_msb; } +static int flat_probe(void) +{ + return 1; +} + static struct apic apic_flat = { .name = "flat", - .probe = NULL, + .probe = flat_probe, .acpi_madt_oem_check = flat_acpi_madt_oem_check, .apic_id_registered = flat_apic_id_registered, -- cgit v1.1 From a35fd28256e7736cc84af8931a16224f0bfaaf6c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 21 Dec 2011 17:45:16 -0800 Subject: x86, acpi: Skip acpi x2apic entries if the x2apic feature is not present If the x2apic feature is not present (either the cpu is not capable of it or the user has disabled the feature using boot-parameter etc), ignore the x2apic MADT and SRAT entries provided by the ACPI tables. Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/20111222014632.540896503@sbsiddha-desk.sc.intel.com Signed-off-by: Suresh Siddha Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/acpi/boot.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 4558f0d..ce664f3 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -219,6 +219,8 @@ static int __init acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) { struct acpi_madt_local_x2apic *processor = NULL; + int apic_id; + u8 enabled; processor = (struct acpi_madt_local_x2apic *)header; @@ -227,6 +229,8 @@ acpi_parse_x2apic(struct acpi_subtable_header *header, const unsigned long end) acpi_table_print_madt_entry(header); + apic_id = processor->local_apic_id; + enabled = processor->lapic_flags & ACPI_MADT_ENABLED; #ifdef CONFIG_X86_X2APIC /* * We need to register disabled CPU as well to permit * counting disabled CPUs. This allows us * to not preallocating memory for all NR_CPUS * when we use CPU hotplug. */ - acpi_register_lapic(processor->local_apic_id, /* APIC ID */ - processor->lapic_flags & ACPI_MADT_ENABLED); + if (!cpu_has_x2apic && (apic_id >= 0xff) && enabled) + printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); + else + acpi_register_lapic(apic_id, enabled); #else printk(KERN_WARNING PREFIX "x2apic entry ignored\n"); #endif -- cgit v1.1 From fb209bd891645bb87b9618b724f0b4928e0df3de Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 21 Dec 2011 17:45:17 -0800 Subject: x86, x2apic: Fallback to xapic when BIOS doesn't setup interrupt-remapping On some of the recent Intel SNB platforms, by default the bios is pre-enabling x2apic mode in the cpu without setting up interrupt-remapping. This case resulted in a kernel panic, as the cpu is already in x2apic mode but the OS was not able to enable interrupt-remapping (which is a pre-req for using x2apic capability). On these platforms all the apic-ids are < 255 and the kernel can fall back to xapic mode if the bios has not enabled interrupt-remapping (which is mostly the case if the bios has not exported interrupt-remapping tables to the OS). Reported-by: Berck E. Nash Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/20111222014632.600418637@sbsiddha-desk.sc.intel.com Signed-off-by: Suresh Siddha Signed-off-by: H.
Peter Anvin --- arch/x86/kernel/apic/apic.c | 73 +++++++++++++++++++++++++++++++----------- arch/x86/kernel/apic/io_apic.c | 4 +++ 2 files changed, 59 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 0783236..2c07aeb 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -146,7 +146,8 @@ __setup("apicpmtimer", setup_apicpmtimer); int x2apic_mode; #ifdef CONFIG_X86_X2APIC /* x2apic enabled before OS handover */ -static int x2apic_preenabled; +int x2apic_preenabled; +static int x2apic_disabled; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { @@ -1432,6 +1433,40 @@ void __init bsp_end_local_APIC_setup(void) } #ifdef CONFIG_X86_X2APIC +/* + * Need to disable xapic and x2apic at the same time and then enable xapic mode + */ +static inline void __disable_x2apic(u64 msr) +{ + wrmsrl(MSR_IA32_APICBASE, + msr & ~(X2APIC_ENABLE | XAPIC_ENABLE)); + wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); +} + +static void disable_x2apic(void) +{ + u64 msr; + + if (!cpu_has_x2apic) + return; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (msr & X2APIC_ENABLE) { + u32 x2apic_id = read_apic_id(); + + if (x2apic_id >= 255) + panic("Cannot disable x2apic, id: %08x\n", x2apic_id); + + pr_info("Disabling x2apic\n"); + __disable_x2apic(msr); + + x2apic_disabled = 1; + x2apic_mode = 0; + + register_lapic_address(mp_lapic_addr); + } +} + void check_x2apic(void) { if (x2apic_enabled()) { @@ -1442,15 +1477,20 @@ void check_x2apic(void) void enable_x2apic(void) { - int msr, msr2; + u64 msr; + + rdmsrl(MSR_IA32_APICBASE, msr); + if (x2apic_disabled) { + __disable_x2apic(msr); + return; + } if (!x2apic_mode) return; - rdmsr(MSR_IA32_APICBASE, msr, msr2); if (!(msr & X2APIC_ENABLE)) { printk_once(KERN_INFO "Enabling x2apic\n"); - wrmsr(MSR_IA32_APICBASE, msr | X2APIC_ENABLE, msr2); + wrmsrl(MSR_IA32_APICBASE, msr | X2APIC_ENABLE); } } #endif /* CONFIG_X86_X2APIC */ @@ -1487,7 +1527,7 @@ void __init enable_IR_x2apic(void) ret = save_ioapic_entries(); if (ret) { pr_info("Saving IO-APIC state failed: %d\n", ret); - goto out; + return; } local_irq_save(flags); @@ -1499,13 +1539,19 @@ void __init enable_IR_x2apic(void) else ret = enable_IR(); + if (!x2apic_supported()) + goto nox2apic; + if (ret < 0) { /* IR is required if there is APIC ID > 255 even when running * under KVM */ if (max_physical_apicid > 255 || - !hypervisor_x2apic_available()) + !hypervisor_x2apic_available()) { + if (x2apic_preenabled) + disable_x2apic(); goto nox2apic; + } /* * without IR all CPUs can be addressed by IOAPIC/MSI * only in physical mode @@ -1513,8 +1559,10 @@ void __init enable_IR_x2apic(void) x2apic_force_phys(); } - if (ret == IRQ_REMAP_XAPIC_MODE) + if (ret == IRQ_REMAP_XAPIC_MODE) { + pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); goto nox2apic; + } x2apic_enabled = 1; @@ -1529,17 +1577,6 @@ nox2apic: restore_ioapic_entries(); legacy_pic->restore_mask(); local_irq_restore(flags); - -out: - if (x2apic_enabled || !x2apic_supported()) - return; - - if (x2apic_preenabled) - panic("x2apic: enabled by BIOS but kernel init failed."); - else if (ret == IRQ_REMAP_XAPIC_MODE) - pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); - else if (ret < 0) - pr_info("x2apic not enabled, IRQ remapping init failed\n"); } #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 6d939d7..45b461f 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ 
b/arch/x86/kernel/apic/io_apic.c @@ -2948,6 +2948,10 @@ static inline void __init check_timer(void) } local_irq_disable(); apic_printk(APIC_QUIET, KERN_INFO "..... failed :(.\n"); + if (x2apic_preenabled) + apic_printk(APIC_QUIET, KERN_INFO + "Perhaps problem with the pre-enabled x2apic mode\n" + "Try booting with x2apic and interrupt-remapping disabled in the bios.\n"); panic("IO-APIC + timer doesn't work! Boot with apic=debug and send a " "report. Then try booting with the 'noapic' option.\n"); out: -- cgit v1.1 From a31bc32760992a2c68f3d6bf7da9f760c0fd7c41 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 23 Dec 2011 11:01:43 -0800 Subject: x86, x2apic: Allow "nox2apic" to disable x2apic mode setup by BIOS Currently "nox2apic" boot parameter was not enabling x2apic mode if the cpu, kernel are all capable of enabling x2apic mode and the OS handover happened in xapic mode. However If the bios enabled x2apic prior to OS handover, using "nox2apic" boot parameter had no effect. If the boot cpu's apicid is < 255, enable "nox2apic" boot parameter to disable the x2apic mode setup by the bios. This will enable the kernel to fallback to xapic mode and bringup only the cpu's which has apic-id < 255. -v2: fix patch error and two compiling warning make disable_x2apic to be __init Signed-off-by: Yinghai Lu Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/CAE9FiQUeB-3uxJAMiHsz=uPWoFv5Hg1pVepz7aU6YtqOxMC-=Q@mail.gmail.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/apic/apic.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 2c07aeb..ff69d5d 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -148,15 +148,24 @@ int x2apic_mode; /* x2apic enabled before OS handover */ int x2apic_preenabled; static int x2apic_disabled; +static int nox2apic; static __init int setup_nox2apic(char *str) { if (x2apic_enabled()) { - pr_warning("Bios already enabled x2apic, " - "can't enforce nox2apic"); - return 0; - } + int apicid = native_apic_msr_read(APIC_ID); + + if (apicid >= 255) { + pr_warning("Apicid: %08x, cannot enforce nox2apic\n", + apicid); + return 0; + } + + pr_warning("x2apic already enabled. 
will disable it\n"); + } else + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + + nox2apic = 1; - setup_clear_cpu_cap(X86_FEATURE_X2APIC); return 0; } early_param("nox2apic", setup_nox2apic); @@ -1443,7 +1452,7 @@ static inline void __disable_x2apic(u64 msr) wrmsrl(MSR_IA32_APICBASE, msr & ~X2APIC_ENABLE); } -static void disable_x2apic(void) +static __init void disable_x2apic(void) { u64 msr; @@ -1460,6 +1469,11 @@ static void disable_x2apic(void) pr_info("Disabling x2apic\n"); __disable_x2apic(msr); + if (nox2apic) { + clear_cpu_cap(&cpu_data(0), X86_FEATURE_X2APIC); + setup_clear_cpu_cap(X86_FEATURE_X2APIC); + } + x2apic_disabled = 1; x2apic_mode = 0; @@ -1534,13 +1548,16 @@ void __init enable_IR_x2apic(void) legacy_pic->mask_all(); mask_ioapic_entries(); + if (x2apic_preenabled && nox2apic) + disable_x2apic(); + if (dmar_table_init_ret) ret = -1; else ret = enable_IR(); if (!x2apic_supported()) - goto nox2apic; + goto skip_x2apic; if (ret < 0) { /* IR is required if there is APIC ID > 255 even when running @@ -1550,7 +1567,7 @@ void __init enable_IR_x2apic(void) !hypervisor_x2apic_available()) { if (x2apic_preenabled) disable_x2apic(); - goto nox2apic; + goto skip_x2apic; } /* * without IR all CPUs can be addressed by IOAPIC/MSI @@ -1561,7 +1578,7 @@ void __init enable_IR_x2apic(void) if (ret == IRQ_REMAP_XAPIC_MODE) { pr_info("x2apic not enabled, IRQ remapping is in xapic mode\n"); - goto nox2apic; + goto skip_x2apic; } x2apic_enabled = 1; @@ -1572,7 +1589,7 @@ void __init enable_IR_x2apic(void) pr_info("Enabled x2apic\n"); } -nox2apic: +skip_x2apic: if (ret < 0) /* IR enabling failed */ restore_ioapic_entries(); legacy_pic->restore_mask(); -- cgit v1.1 From c284b42abadbb22083bfde24d308899c08d44ffa Mon Sep 17 00:00:00 2001 From: Suresh Siddha Date: Wed, 21 Dec 2011 17:45:19 -0800 Subject: x86: Skip cpus with apic-ids >= 255 in !x2apic_mode If the x2apic mode is disabled for reasons like interrupt-remapping not available etc, then we need to skip the logical cpu bringup of apic-id's >= 255. Otherwise as the platform is in xapic mode, init/startup IPI's will consider only the low 8-bits and there is a possibility of re-sending init/startup IPI's to the logical cpu that is already online. This will avoid potential reboots/unpredictable behavior etc. Signed-off-by: Suresh Siddha Link: http://lkml.kernel.org/r/20111222014632.702932458@sbsiddha-desk.sc.intel.com Signed-off-by: H. Peter Anvin --- arch/x86/kernel/smpboot.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index 9f548cb..e38e217 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -840,7 +840,8 @@ int __cpuinit native_cpu_up(unsigned int cpu) pr_debug("++++++++++++++++++++=_---CPU UP %u\n", cpu); if (apicid == BAD_APICID || apicid == boot_cpu_physical_apicid || - !physid_isset(apicid, phys_cpu_present_map)) { + !physid_isset(apicid, phys_cpu_present_map) || + (!x2apic_mode && apicid >= 255)) { printk(KERN_ERR "%s: bad cpu %d\n", __func__, cpu); return -EINVAL; } -- cgit v1.1 From 2c9ede55ecec58099b72e4bb8eab719f32f72c31 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 23 Jul 2011 20:24:48 -0400 Subject: switch device_get_devnode() and ->devnode() to umode_t * both callers of device_get_devnode() are only interested in lower 16bits and nobody tries to return anything wider than 16bit anyway. 
Signed-off-by: Al Viro --- arch/x86/kernel/cpuid.c | 2 +- arch/x86/kernel/msr.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/cpuid.c b/arch/x86/kernel/cpuid.c index 212a6a4..a524353 100644 --- a/arch/x86/kernel/cpuid.c +++ b/arch/x86/kernel/cpuid.c @@ -177,7 +177,7 @@ static struct notifier_block __refdata cpuid_class_cpu_notifier = .notifier_call = cpuid_class_cpu_callback, }; -static char *cpuid_devnode(struct device *dev, mode_t *mode) +static char *cpuid_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/cpuid", MINOR(dev->devt)); } diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c index 12fcbe2..9635676 100644 --- a/arch/x86/kernel/msr.c +++ b/arch/x86/kernel/msr.c @@ -236,7 +236,7 @@ static struct notifier_block __refdata msr_class_cpu_notifier = { .notifier_call = msr_class_cpu_callback, }; -static char *msr_devnode(struct device *dev, mode_t *mode) +static char *msr_devnode(struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "cpu/%u/msr", MINOR(dev->devt)); } -- cgit v1.1
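To illustrate the new callback shape introduced by the last patch above, a hypothetical ->devnode() implementation after the mode_t to umode_t switch could look like this (the device name and permissions are invented for the example; the cpuid/msr callbacks in the diff simply return a name and leave *mode untouched):

    static char *example_devnode(struct device *dev, umode_t *mode)
    {
    	if (mode)
    		*mode = 0600;	/* the 16-bit umode_t easily holds the permission bits */
    	return kasprintf(GFP_KERNEL, "example/%s", dev_name(dev));
    }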