diff options
Diffstat (limited to 'arch/x86/kernel')
63 files changed, 1799 insertions, 1942 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cdb1b70..c887cd9 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o +obj-$(CONFIG_IA32_EMULATION) += syscall_32.o obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 703130f..aef6531 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, ...) \ -do { \ - if (debug_alternative) \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +#define DPRINTK(fmt, args...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ +} while (0) + +#define DUMP_BYTES(buf, len, fmt, args...) \ +do { \ + if (unlikely(debug_alternative)) { \ + int j; \ + \ + if (!(len)) \ + break; \ + \ + printk(KERN_DEBUG fmt, ##args); \ + for (j = 0; j < (len) - 1; j++) \ + printk(KERN_CONT "%02hhx ", buf[j]); \ + printk(KERN_CONT "%02hhx\n", buf[j]); \ + } \ } while (0) /* @@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void *text_poke_early(void *addr, const void *opcode, size_t len); -/* Replace instructions with better alternatives for this CPU type. - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that asymmetric systems where - APs have less capabilities than the boot processor are not handled. - Tough. Make sure you disable such features by hand. */ +/* + * Are we looking at a near JMP with a 1 or 4-byte displacement. + */ +static inline bool is_jmp(const u8 opcode) +{ + return opcode == 0xeb || opcode == 0xe9; +} + +static void __init_or_module +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +{ + u8 *next_rip, *tgt_rip; + s32 n_dspl, o_dspl; + int repl_len; + + if (a->replacementlen != 5) + return; + + o_dspl = *(s32 *)(insnbuf + 1); + + /* next_rip of the replacement JMP */ + next_rip = repl_insn + a->replacementlen; + /* target rip of the replacement JMP */ + tgt_rip = next_rip + o_dspl; + n_dspl = tgt_rip - orig_insn; + + DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + + if (tgt_rip - orig_insn >= 0) { + if (n_dspl - 2 <= 127) + goto two_byte_jmp; + else + goto five_byte_jmp; + /* negative offset */ + } else { + if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) + goto two_byte_jmp; + else + goto five_byte_jmp; + } + +two_byte_jmp: + n_dspl -= 2; + + insnbuf[0] = 0xeb; + insnbuf[1] = (s8)n_dspl; + add_nops(insnbuf + 2, 3); + + repl_len = 2; + goto done; + +five_byte_jmp: + n_dspl -= 5; + + insnbuf[0] = 0xe9; + *(s32 *)&insnbuf[1] = n_dspl; + repl_len = 5; + +done: + + DPRINTK("final displ: 0x%08x, JMP 0x%lx", + n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); +} + +static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) +{ + if (instr[0] != 0x90) + return; + + add_nops(instr + (a->instrlen - a->padlen), a->padlen); + + DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + instr, a->instrlen - a->padlen, a->padlen); +} + +/* + * Replace instructions with better alternatives for this CPU type. This runs + * before SMP is initialized to avoid SMP problems with self modifying code. + * This implies that asymmetric systems where APs have less capabilities than + * the boot processor are not handled. Tough. Make sure you disable such + * features by hand. + */ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { @@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + DPRINTK("alt table %p -> %p", start, end); /* * The scan order should be from start to end. A later scanned - * alternative code can overwrite a previous scanned alternative code. + * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. * @@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { + int insnbuf_sz = 0; + instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) + if (!boot_cpu_has(a->cpuid)) { + if (a->padlen > 1) + optimize_nops(a, instr); + continue; + } + + DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d", + a->cpuid >> 5, + a->cpuid & 0x1f, + instr, a->instrlen, + replacement, a->replacementlen, a->padlen); + + DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += replacement - instr; + if (*insnbuf == 0xe8 && a->replacementlen == 5) { + *(s32 *)(insnbuf + 1) += replacement - instr; + DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", + *(s32 *)(insnbuf + 1), + (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); + } + + if (a->replacementlen && is_jmp(replacement[0])) + recompute_jump(a, instr, replacement, insnbuf); - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); + if (a->instrlen > a->replacementlen) { + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); - text_poke_early(instr, insnbuf, a->instrlen); + text_poke_early(instr, insnbuf, insnbuf_sz); } } #ifdef CONFIG_SMP - static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { @@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", - __func__, smp->locks, smp->locks_end, + DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); @@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end) return 0; } -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT void __init_or_module apply_paravirt(struct paravirt_patch_site *start, @@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs) if (likely(!bp_patching_in_progress)) return 0; - if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr) + if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr) return 0; /* set up the specified breakpoint handler */ diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index ad3639a..dcb5285 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -1084,67 +1084,6 @@ void lapic_shutdown(void) local_irq_restore(flags); } -/* - * This is to verify that we're looking at a real local APIC. - * Check these against your board if the CPUs aren't getting - * started for no apparent reason. - */ -int __init verify_local_APIC(void) -{ - unsigned int reg0, reg1; - - /* - * The version register is read-only in a real APIC. - */ - reg0 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0); - apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK); - reg1 = apic_read(APIC_LVR); - apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1); - - /* - * The two version reads above should print the same - * numbers. If the second one is different, then we - * poke at a non-APIC. - */ - if (reg1 != reg0) - return 0; - - /* - * Check if the version looks reasonably. - */ - reg1 = GET_APIC_VERSION(reg0); - if (reg1 == 0x00 || reg1 == 0xff) - return 0; - reg1 = lapic_get_maxlvt(); - if (reg1 < 0x02 || reg1 == 0xff) - return 0; - - /* - * The ID register is read/write in a real APIC. - */ - reg0 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0); - apic_write(APIC_ID, reg0 ^ apic->apic_id_mask); - reg1 = apic_read(APIC_ID); - apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1); - apic_write(APIC_ID, reg0); - if (reg1 != (reg0 ^ apic->apic_id_mask)) - return 0; - - /* - * The next two are just to see if we have sane values. - * They're only really relevant if we're in Virtual Wire - * compatibility mode, but most boxes are anymore. - */ - reg0 = apic_read(APIC_LVT0); - apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0); - reg1 = apic_read(APIC_LVT1); - apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1); - - return 1; -} - /** * sync_Arb_IDs - synchronize APIC bus arbitration IDs */ @@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void) disable_ioapic_support(); default_setup_apic_routing(); - verify_local_APIC(); apic_bsp_setup(true); return 0; } diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index e658f21..d9d0bd2 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -135,12 +135,12 @@ static void init_x2apic_ldr(void) per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR); - __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); + cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu)); for_each_online_cpu(cpu) { if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu)) continue; - __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu)); - __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu)); + cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu)); } } @@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void) BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu)); - __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu)); + cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu)); register_hotcpu_notifier(&x2apic_cpu_notifier); return 1; } diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c index 8e9dcfd..c8d9295 100644 --- a/arch/x86/kernel/apic/x2apic_uv_x.c +++ b/arch/x86/kernel/apic/x2apic_uv_x.c @@ -144,33 +144,60 @@ static void __init uv_set_apicid_hibit(void) static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { - int pnodeid, is_uv1, is_uv2, is_uv3; - - is_uv1 = !strcmp(oem_id, "SGI"); - is_uv2 = !strcmp(oem_id, "SGI2"); - is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */ - if (is_uv1 || is_uv2 || is_uv3) { - uv_hub_info->hub_revision = - (is_uv1 ? UV1_HUB_REVISION_BASE : - (is_uv2 ? UV2_HUB_REVISION_BASE : - UV3_HUB_REVISION_BASE)); - pnodeid = early_get_pnodeid(); - early_get_apic_pnode_shift(); - x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; - x86_platform.nmi_init = uv_nmi_init; - if (!strcmp(oem_table_id, "UVL")) - uv_system_type = UV_LEGACY_APIC; - else if (!strcmp(oem_table_id, "UVX")) - uv_system_type = UV_X2APIC; - else if (!strcmp(oem_table_id, "UVH")) { - __this_cpu_write(x2apic_extra_bits, - pnodeid << uvh_apicid.s.pnode_shift); - uv_system_type = UV_NON_UNIQUE_APIC; - uv_set_apicid_hibit(); - return 1; - } + int pnodeid; + int uv_apic; + + if (strncmp(oem_id, "SGI", 3) != 0) + return 0; + + /* + * Determine UV arch type. + * SGI: UV100/1000 + * SGI2: UV2000/3000 + * SGI3: UV300 (truncated to 4 chars because of different varieties) + */ + uv_hub_info->hub_revision = + !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE : + !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE : + !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0; + + if (uv_hub_info->hub_revision == 0) + goto badbios; + + pnodeid = early_get_pnodeid(); + early_get_apic_pnode_shift(); + x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range; + x86_platform.nmi_init = uv_nmi_init; + + if (!strcmp(oem_table_id, "UVX")) { /* most common */ + uv_system_type = UV_X2APIC; + uv_apic = 0; + + } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */ + uv_system_type = UV_NON_UNIQUE_APIC; + __this_cpu_write(x2apic_extra_bits, + pnodeid << uvh_apicid.s.pnode_shift); + uv_set_apicid_hibit(); + uv_apic = 1; + + } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */ + uv_system_type = UV_LEGACY_APIC; /* very small systems */ + uv_apic = 0; + + } else { + goto badbios; } - return 0; + + pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n", + oem_id, oem_table_id, uv_system_type, + uv_min_hub_revision_id, uv_apic); + + return uv_apic; + +badbios: + pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id); + pr_err("Current BIOS not supported, update kernel and/or BIOS\n"); + BUG(); } enum uv_system_type get_uv_system_type(void) @@ -854,10 +881,14 @@ void __init uv_system_init(void) unsigned long mmr_base, present, paddr; unsigned short pnode_mask; unsigned char n_lshift; - char *hub = (is_uv1_hub() ? "UV1" : - (is_uv2_hub() ? "UV2" : - "UV3")); + char *hub = (is_uv1_hub() ? "UV100/1000" : + (is_uv2_hub() ? "UV2000/3000" : + (is_uv3_hub() ? "UV300" : NULL))); + if (!hub) { + pr_err("UV: Unknown/unsupported UV hub\n"); + return; + } pr_info("UV: Found %s hub\n", hub); map_low_mmrs(); diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 3b3b9d3..47703ae 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -68,7 +68,7 @@ void foo(void) /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - sizeof(struct tss_struct)); + offsetofend(struct tss_struct, SYSENTER_stack)); #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index fdcbb4d..5ce6f2d 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -81,6 +81,7 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a220239..fd470eb 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -5,6 +5,7 @@ #include <linux/io.h> #include <linux/sched.h> +#include <linux/random.h> #include <asm/processor.h> #include <asm/apic.h> #include <asm/cpu.h> @@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) va_align.mask = (upperbit - 1) & PAGE_MASK; va_align.flags = ALIGN_VA_32 | ALIGN_VA_64; + + /* A random value per boot for bit slice [12:upper_bit) */ + va_align.bits = get_random_int() & va_align.mask; } } @@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* 3DNow or LM implies PREFETCHW */ + if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) + if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) + set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 1cd4a1a..a62cf04 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -998,38 +998,37 @@ static void identify_cpu(struct cpuinfo_x86 *c) #endif } -#ifdef CONFIG_X86_64 -#ifdef CONFIG_IA32_EMULATION -/* May not be __init: called during resume */ -static void syscall32_cpu_init(void) -{ - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. */ - wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); - wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); - - wrmsrl(MSR_CSTAR, ia32_cstar_target); -} -#endif /* CONFIG_IA32_EMULATION */ -#endif /* CONFIG_X86_64 */ - +/* + * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions + * on 32-bit kernels: + */ #ifdef CONFIG_X86_32 void enable_sep_cpu(void) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss; + int cpu; - if (!boot_cpu_has(X86_FEATURE_SEP)) { - put_cpu(); - return; - } + cpu = get_cpu(); + tss = &per_cpu(cpu_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) + goto out; + + /* + * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- + * see the big comment in struct x86_hw_tss's definition. + */ tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); + + wrmsr(MSR_IA32_SYSENTER_ESP, + (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), + 0); + + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + +out: put_cpu(); } #endif @@ -1157,7 +1156,7 @@ static __init int setup_disablecpuid(char *arg) __setup("clearcpuid=", setup_disablecpuid); DEFINE_PER_CPU(unsigned long, kernel_stack) = - (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE; + (unsigned long)&init_thread_union + THREAD_SIZE; EXPORT_PER_CPU_SYMBOL(kernel_stack); #ifdef CONFIG_X86_64 @@ -1169,8 +1168,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; /* - * The following four percpu variables are hot. Align current_task to - * cacheline size such that all four fall in the same cacheline. + * The following percpu variables are hot. Align current_task to + * cacheline size such that they fall in the same cacheline. */ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; @@ -1210,10 +1209,23 @@ void syscall_init(void) */ wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32); wrmsrl(MSR_LSTAR, system_call); - wrmsrl(MSR_CSTAR, ignore_sysret); #ifdef CONFIG_IA32_EMULATION - syscall32_cpu_init(); + wrmsrl(MSR_CSTAR, ia32_cstar_target); + /* + * This only works on Intel CPUs. + * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP. + * This does not cause SYSENTER to jump to the wrong location, because + * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). + */ + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); +#else + wrmsrl(MSR_CSTAR, ignore_sysret); + wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL); #endif /* Flags to clear on syscall */ @@ -1265,6 +1277,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +/* + * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find + * the top of the kernel stack. Use an extra percpu variable to track the + * top of the kernel stack directly. + */ +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = + (unsigned long)&init_thread_union + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); + #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif @@ -1346,7 +1367,7 @@ void cpu_init(void) */ load_ucode_ap(); - t = &per_cpu(init_tss, cpu); + t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1430,7 +1451,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 6596433..edcb0e2 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -7,16 +7,14 @@ * Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD. */ -#include <linux/init.h> #include <linux/slab.h> -#include <linux/device.h> -#include <linux/compiler.h> +#include <linux/cacheinfo.h> #include <linux/cpu.h> #include <linux/sched.h> +#include <linux/sysfs.h> #include <linux/pci.h> #include <asm/processor.h> -#include <linux/smp.h> #include <asm/amd_nb.h> #include <asm/smp.h> @@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] = enum _cache_type { - CACHE_TYPE_NULL = 0, - CACHE_TYPE_DATA = 1, - CACHE_TYPE_INST = 2, - CACHE_TYPE_UNIFIED = 3 + CTYPE_NULL = 0, + CTYPE_DATA = 1, + CTYPE_INST = 2, + CTYPE_UNIFIED = 3 }; union _cpuid4_leaf_eax { @@ -159,11 +157,6 @@ struct _cpuid4_info_regs { struct amd_northbridge *nb; }; -struct _cpuid4_info { - struct _cpuid4_info_regs base; - DECLARE_BITMAP(shared_cpu_map, NR_CPUS); -}; - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -220,6 +213,13 @@ static const unsigned short assocs[] = { static const unsigned char levels[] = { 1, 1, 2, 3 }; static const unsigned char types[] = { 1, 2, 3, 3 }; +static const enum cache_type cache_type_map[] = { + [CTYPE_NULL] = CACHE_TYPE_NOCACHE, + [CTYPE_DATA] = CACHE_TYPE_DATA, + [CTYPE_INST] = CACHE_TYPE_INST, + [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED, +}; + static void amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, union _cpuid4_leaf_ebx *ebx, @@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, (ebx->split.ways_of_associativity + 1) - 1; } -struct _cache_attr { - struct attribute attr; - ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int); - ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count, - unsigned int); -}; - #if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS) + /* * L3 cache descriptors */ @@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb) l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1; } -static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) -{ - int node; - - /* only for L3, and not in virtualized environments */ - if (index < 3) - return; - - node = amd_get_nb_id(smp_processor_id()); - this_leaf->nb = node_to_amd_nb(node); - if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) - amd_calc_l3_indices(this_leaf->nb); -} - /* * check whether a slot used for disabling an L3 index is occupied. * @l3: L3 cache descriptor @@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot) return -1; } -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, +static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf, unsigned int slot) { int index; + struct amd_northbridge *nb = this_leaf->priv; - if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return -EINVAL; - - index = amd_get_l3_disable_slot(this_leaf->base.nb, slot); + index = amd_get_l3_disable_slot(nb, slot); if (index >= 0) return sprintf(buf, "%d\n", index); @@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, #define SHOW_CACHE_DISABLE(slot) \ static ssize_t \ -show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \ - unsigned int cpu) \ +cache_disable_##slot##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ { \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ return show_cache_disable(this_leaf, buf, slot); \ } SHOW_CACHE_DISABLE(0) @@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot, return 0; } -static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, - const char *buf, size_t count, - unsigned int slot) +static ssize_t store_cache_disable(struct cacheinfo *this_leaf, + const char *buf, size_t count, + unsigned int slot) { unsigned long val = 0; int cpu, err = 0; + struct amd_northbridge *nb = this_leaf->priv; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - return -EINVAL; - - cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + cpu = cpumask_first(&this_leaf->shared_cpu_map); if (kstrtoul(buf, 10, &val) < 0) return -EINVAL; - err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val); + err = amd_set_l3_disable_slot(nb, cpu, slot, val); if (err) { if (err == -EEXIST) pr_warning("L3 slot %d in use/index already disabled!\n", @@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, #define STORE_CACHE_DISABLE(slot) \ static ssize_t \ -store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \ - const char *buf, size_t count, \ - unsigned int cpu) \ +cache_disable_##slot##_store(struct device *dev, \ + struct device_attribute *attr, \ + const char *buf, size_t count) \ { \ + struct cacheinfo *this_leaf = dev_get_drvdata(dev); \ return store_cache_disable(this_leaf, buf, count, slot); \ } STORE_CACHE_DISABLE(0) STORE_CACHE_DISABLE(1) -static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, - show_cache_disable_0, store_cache_disable_0); -static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, - show_cache_disable_1, store_cache_disable_1); - -static ssize_t -show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu) +static ssize_t subcaches_show(struct device *dev, + struct device_attribute *attr, char *buf) { - if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return -EINVAL; + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); return sprintf(buf, "%x\n", amd_get_subcaches(cpu)); } -static ssize_t -store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, - unsigned int cpu) +static ssize_t subcaches_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + int cpu = cpumask_first(&this_leaf->shared_cpu_map); unsigned long val; if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - return -EINVAL; - if (kstrtoul(buf, 16, &val) < 0) return -EINVAL; @@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count, return count; } -static struct _cache_attr subcaches = - __ATTR(subcaches, 0644, show_subcaches, store_subcaches); +static DEVICE_ATTR_RW(cache_disable_0); +static DEVICE_ATTR_RW(cache_disable_1); +static DEVICE_ATTR_RW(subcaches); + +static umode_t +cache_private_attrs_is_visible(struct kobject *kobj, + struct attribute *attr, int unused) +{ + struct device *dev = kobj_to_dev(kobj); + struct cacheinfo *this_leaf = dev_get_drvdata(dev); + umode_t mode = attr->mode; + + if (!this_leaf->priv) + return 0; + + if ((attr == &dev_attr_subcaches.attr) && + amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + return mode; + + if ((attr == &dev_attr_cache_disable_0.attr || + attr == &dev_attr_cache_disable_1.attr) && + amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + return mode; + + return 0; +} + +static struct attribute_group cache_private_group = { + .is_visible = cache_private_attrs_is_visible, +}; + +static void init_amd_l3_attrs(void) +{ + int n = 1; + static struct attribute **amd_l3_attrs; + + if (amd_l3_attrs) /* already initialized */ + return; + + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) + n += 2; + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + n += 1; + + amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL); + if (!amd_l3_attrs) + return; + + n = 0; + if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { + amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr; + amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr; + } + if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) + amd_l3_attrs[n++] = &dev_attr_subcaches.attr; + cache_private_group.attrs = amd_l3_attrs; +} + +const struct attribute_group * +cache_get_priv_group(struct cacheinfo *this_leaf) +{ + struct amd_northbridge *nb = this_leaf->priv; + + if (this_leaf->level < 3 || !nb) + return NULL; + + if (nb && nb->l3_cache.indices) + init_amd_l3_attrs(); + + return &cache_private_group; +} + +static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index) +{ + int node; + + /* only for L3, and not in virtualized environments */ + if (index < 3) + return; + + node = amd_get_nb_id(smp_processor_id()); + this_leaf->nb = node_to_amd_nb(node); + if (this_leaf->nb && !this_leaf->nb->l3_cache.indices) + amd_calc_l3_indices(this_leaf->nb); +} #else #define amd_init_l3_cache(x, y) #endif /* CONFIG_AMD_NB && CONFIG_SYSFS */ @@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf) cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx); } - if (eax.split.type == CACHE_TYPE_NULL) + if (eax.split.type == CTYPE_NULL) return -EIO; /* better error ? */ this_leaf->eax = eax; @@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c) /* Do cpuid(op) loop to find out num_cache_leaves */ cpuid_count(op, i, &eax, &ebx, &ecx, &edx); cache_eax.full = eax; - } while (cache_eax.split.type != CACHE_TYPE_NULL); + } while (cache_eax.split.type != CTYPE_NULL); return i; } @@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) switch (this_leaf.eax.split.level) { case 1: - if (this_leaf.eax.split.type == CACHE_TYPE_DATA) + if (this_leaf.eax.split.type == CTYPE_DATA) new_l1d = this_leaf.size/1024; - else if (this_leaf.eax.split.type == CACHE_TYPE_INST) + else if (this_leaf.eax.split.type == CTYPE_INST) new_l1i = this_leaf.size/1024; break; case 2: @@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c) return l2; } -#ifdef CONFIG_SYSFS - -/* pointer to _cpuid4_info array (for each cache leaf) */ -static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info); -#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y])) - -#ifdef CONFIG_SMP - -static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) +static int __cache_amd_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) { - struct _cpuid4_info *this_leaf; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf; int i, sibling; if (cpu_has_topoext) { unsigned int apicid, nshared, first, last; - if (!per_cpu(ici_cpuid4_info, cpu)) - return 0; - - this_leaf = CPUID4_INFO_IDX(cpu, index); - nshared = this_leaf->base.eax.split.num_threads_sharing + 1; + this_leaf = this_cpu_ci->info_list + index; + nshared = base->eax.split.num_threads_sharing + 1; apicid = cpu_data(cpu).apicid; first = apicid - (apicid % nshared); last = first + nshared - 1; for_each_online_cpu(i) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + apicid = cpu_data(i).apicid; if ((apicid < first) || (apicid > last)) continue; - if (!per_cpu(ici_cpuid4_info, i)) - continue; - this_leaf = CPUID4_INFO_IDX(i, index); + + this_leaf = this_cpu_ci->info_list + index; for_each_online_cpu(sibling) { apicid = cpu_data(sibling).apicid; if ((apicid < first) || (apicid > last)) continue; - set_bit(sibling, this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); } } } else if (index == 3) { for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - if (!per_cpu(ici_cpuid4_info, i)) + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) continue; - this_leaf = CPUID4_INFO_IDX(i, index); + this_leaf = this_cpu_ci->info_list + index; for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { if (!cpu_online(sibling)) continue; - set_bit(sibling, this_leaf->shared_cpu_map); + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); } } } else @@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index) return 1; } -static void cache_shared_cpu_map_setup(unsigned int cpu, int index) +static void __cache_cpumap_setup(unsigned int cpu, int index, + struct _cpuid4_info_regs *base) { - struct _cpuid4_info *this_leaf, *sibling_leaf; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf, *sibling_leaf; unsigned long num_threads_sharing; int index_msb, i; struct cpuinfo_x86 *c = &cpu_data(cpu); if (c->x86_vendor == X86_VENDOR_AMD) { - if (cache_shared_amd_cpu_map_setup(cpu, index)) + if (__cache_amd_cpumap_setup(cpu, index, base)) return; } - this_leaf = CPUID4_INFO_IDX(cpu, index); - num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing; + this_leaf = this_cpu_ci->info_list + index; + num_threads_sharing = 1 + base->eax.split.num_threads_sharing; + cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map); if (num_threads_sharing == 1) - cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map)); - else { - index_msb = get_count_order(num_threads_sharing); - - for_each_online_cpu(i) { - if (cpu_data(i).apicid >> index_msb == - c->apicid >> index_msb) { - cpumask_set_cpu(i, - to_cpumask(this_leaf->shared_cpu_map)); - if (i != cpu && per_cpu(ici_cpuid4_info, i)) { - sibling_leaf = - CPUID4_INFO_IDX(i, index); - cpumask_set_cpu(cpu, to_cpumask( - sibling_leaf->shared_cpu_map)); - } - } - } - } -} -static void cache_remove_shared_cpu_map(unsigned int cpu, int index) -{ - struct _cpuid4_info *this_leaf, *sibling_leaf; - int sibling; - - this_leaf = CPUID4_INFO_IDX(cpu, index); - for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) { - sibling_leaf = CPUID4_INFO_IDX(sibling, index); - cpumask_clear_cpu(cpu, - to_cpumask(sibling_leaf->shared_cpu_map)); - } -} -#else -static void cache_shared_cpu_map_setup(unsigned int cpu, int index) -{ -} - -static void cache_remove_shared_cpu_map(unsigned int cpu, int index) -{ -} -#endif - -static void free_cache_attributes(unsigned int cpu) -{ - int i; - - for (i = 0; i < num_cache_leaves; i++) - cache_remove_shared_cpu_map(cpu, i); - - kfree(per_cpu(ici_cpuid4_info, cpu)); - per_cpu(ici_cpuid4_info, cpu) = NULL; -} - -static void get_cpu_leaves(void *_retval) -{ - int j, *retval = _retval, cpu = smp_processor_id(); + return; - /* Do cpuid and store the results */ - for (j = 0; j < num_cache_leaves; j++) { - struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j); + index_msb = get_count_order(num_threads_sharing); - *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base); - if (unlikely(*retval < 0)) { - int i; + for_each_online_cpu(i) + if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) { + struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i); - for (i = 0; i < j; i++) - cache_remove_shared_cpu_map(cpu, i); - break; + if (i == cpu || !sib_cpu_ci->info_list) + continue;/* skip if itself or no cacheinfo */ + sibling_leaf = sib_cpu_ci->info_list + index; + cpumask_set_cpu(i, &this_leaf->shared_cpu_map); + cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map); } - cache_shared_cpu_map_setup(cpu, j); - } } -static int detect_cache_attributes(unsigned int cpu) +static void ci_leaf_init(struct cacheinfo *this_leaf, + struct _cpuid4_info_regs *base) { - int retval; - - if (num_cache_leaves == 0) - return -ENOENT; - - per_cpu(ici_cpuid4_info, cpu) = kzalloc( - sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL); - if (per_cpu(ici_cpuid4_info, cpu) == NULL) - return -ENOMEM; - - smp_call_function_single(cpu, get_cpu_leaves, &retval, true); - if (retval) { - kfree(per_cpu(ici_cpuid4_info, cpu)); - per_cpu(ici_cpuid4_info, cpu) = NULL; - } - - return retval; + this_leaf->level = base->eax.split.level; + this_leaf->type = cache_type_map[base->eax.split.type]; + this_leaf->coherency_line_size = + base->ebx.split.coherency_line_size + 1; + this_leaf->ways_of_associativity = + base->ebx.split.ways_of_associativity + 1; + this_leaf->size = base->size; + this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1; + this_leaf->physical_line_partition = + base->ebx.split.physical_line_partition + 1; + this_leaf->priv = base->nb; } -#include <linux/kobject.h> -#include <linux/sysfs.h> -#include <linux/cpu.h> - -/* pointer to kobject for cpuX/cache */ -static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject); - -struct _index_kobject { - struct kobject kobj; - unsigned int cpu; - unsigned short index; -}; - -/* pointer to array of kobjects for cpuX/cache/indexY */ -static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject); -#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y])) - -#define show_one_plus(file_name, object, val) \ -static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \ - unsigned int cpu) \ -{ \ - return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \ -} - -show_one_plus(level, base.eax.split.level, 0); -show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1); -show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1); -show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1); -show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1); - -static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf, - unsigned int cpu) -{ - return sprintf(buf, "%luK\n", this_leaf->base.size / 1024); -} - -static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf, - int type, char *buf) -{ - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int ret; - - if (type) - ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl", - cpumask_pr_args(mask)); - else - ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb", - cpumask_pr_args(mask)); - buf[ret++] = '\n'; - buf[ret] = '\0'; - return ret; -} - -static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf, - unsigned int cpu) +static int __init_cache_level(unsigned int cpu) { - return show_shared_cpu_map_func(leaf, 0, buf); -} - -static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf, - unsigned int cpu) -{ - return show_shared_cpu_map_func(leaf, 1, buf); -} + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); -static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf, - unsigned int cpu) -{ - switch (this_leaf->base.eax.split.type) { - case CACHE_TYPE_DATA: - return sprintf(buf, "Data\n"); - case CACHE_TYPE_INST: - return sprintf(buf, "Instruction\n"); - case CACHE_TYPE_UNIFIED: - return sprintf(buf, "Unified\n"); - default: - return sprintf(buf, "Unknown\n"); - } -} - -#define to_object(k) container_of(k, struct _index_kobject, kobj) -#define to_attr(a) container_of(a, struct _cache_attr, attr) - -#define define_one_ro(_name) \ -static struct _cache_attr _name = \ - __ATTR(_name, 0444, show_##_name, NULL) - -define_one_ro(level); -define_one_ro(type); -define_one_ro(coherency_line_size); -define_one_ro(physical_line_partition); -define_one_ro(ways_of_associativity); -define_one_ro(number_of_sets); -define_one_ro(size); -define_one_ro(shared_cpu_map); -define_one_ro(shared_cpu_list); - -static struct attribute *default_attrs[] = { - &type.attr, - &level.attr, - &coherency_line_size.attr, - &physical_line_partition.attr, - &ways_of_associativity.attr, - &number_of_sets.attr, - &size.attr, - &shared_cpu_map.attr, - &shared_cpu_list.attr, - NULL -}; - -#ifdef CONFIG_AMD_NB -static struct attribute **amd_l3_attrs(void) -{ - static struct attribute **attrs; - int n; - - if (attrs) - return attrs; - - n = ARRAY_SIZE(default_attrs); - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) - n += 2; - - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - n += 1; - - attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL); - if (attrs == NULL) - return attrs = default_attrs; - - for (n = 0; default_attrs[n]; n++) - attrs[n] = default_attrs[n]; - - if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) { - attrs[n++] = &cache_disable_0.attr; - attrs[n++] = &cache_disable_1.attr; - } - - if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) - attrs[n++] = &subcaches.attr; - - return attrs; -} -#endif - -static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf) -{ - struct _cache_attr *fattr = to_attr(attr); - struct _index_kobject *this_leaf = to_object(kobj); - ssize_t ret; - - ret = fattr->show ? - fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, this_leaf->cpu) : - 0; - return ret; -} - -static ssize_t store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t count) -{ - struct _cache_attr *fattr = to_attr(attr); - struct _index_kobject *this_leaf = to_object(kobj); - ssize_t ret; - - ret = fattr->store ? - fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index), - buf, count, this_leaf->cpu) : - 0; - return ret; -} - -static const struct sysfs_ops sysfs_ops = { - .show = show, - .store = store, -}; - -static struct kobj_type ktype_cache = { - .sysfs_ops = &sysfs_ops, - .default_attrs = default_attrs, -}; - -static struct kobj_type ktype_percpu_entry = { - .sysfs_ops = &sysfs_ops, -}; - -static void cpuid4_cache_sysfs_exit(unsigned int cpu) -{ - kfree(per_cpu(ici_cache_kobject, cpu)); - kfree(per_cpu(ici_index_kobject, cpu)); - per_cpu(ici_cache_kobject, cpu) = NULL; - per_cpu(ici_index_kobject, cpu) = NULL; - free_cache_attributes(cpu); -} - -static int cpuid4_cache_sysfs_init(unsigned int cpu) -{ - int err; - - if (num_cache_leaves == 0) + if (!num_cache_leaves) return -ENOENT; - - err = detect_cache_attributes(cpu); - if (err) - return err; - - /* Allocate all required memory */ - per_cpu(ici_cache_kobject, cpu) = - kzalloc(sizeof(struct kobject), GFP_KERNEL); - if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL)) - goto err_out; - - per_cpu(ici_index_kobject, cpu) = kzalloc( - sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL); - if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL)) - goto err_out; - + if (!this_cpu_ci) + return -EINVAL; + this_cpu_ci->num_levels = 3; + this_cpu_ci->num_leaves = num_cache_leaves; return 0; - -err_out: - cpuid4_cache_sysfs_exit(cpu); - return -ENOMEM; } -static DECLARE_BITMAP(cache_dev_map, NR_CPUS); - -/* Add/Remove cache interface for CPU device */ -static int cache_add_dev(struct device *dev) +static int __populate_cache_leaves(unsigned int cpu) { - unsigned int cpu = dev->id; - unsigned long i, j; - struct _index_kobject *this_object; - struct _cpuid4_info *this_leaf; - int retval; - - retval = cpuid4_cache_sysfs_init(cpu); - if (unlikely(retval < 0)) - return retval; - - retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu), - &ktype_percpu_entry, - &dev->kobj, "%s", "cache"); - if (retval < 0) { - cpuid4_cache_sysfs_exit(cpu); - return retval; - } + unsigned int idx, ret; + struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu); + struct cacheinfo *this_leaf = this_cpu_ci->info_list; + struct _cpuid4_info_regs id4_regs = {}; - for (i = 0; i < num_cache_leaves; i++) { - this_object = INDEX_KOBJECT_PTR(cpu, i); - this_object->cpu = cpu; - this_object->index = i; - - this_leaf = CPUID4_INFO_IDX(cpu, i); - - ktype_cache.default_attrs = default_attrs; -#ifdef CONFIG_AMD_NB - if (this_leaf->base.nb) - ktype_cache.default_attrs = amd_l3_attrs(); -#endif - retval = kobject_init_and_add(&(this_object->kobj), - &ktype_cache, - per_cpu(ici_cache_kobject, cpu), - "index%1lu", i); - if (unlikely(retval)) { - for (j = 0; j < i; j++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj)); - kobject_put(per_cpu(ici_cache_kobject, cpu)); - cpuid4_cache_sysfs_exit(cpu); - return retval; - } - kobject_uevent(&(this_object->kobj), KOBJ_ADD); + for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) { + ret = cpuid4_cache_lookup_regs(idx, &id4_regs); + if (ret) + return ret; + ci_leaf_init(this_leaf++, &id4_regs); + __cache_cpumap_setup(cpu, idx, &id4_regs); } - cpumask_set_cpu(cpu, to_cpumask(cache_dev_map)); - - kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD); return 0; } -static void cache_remove_dev(struct device *dev) -{ - unsigned int cpu = dev->id; - unsigned long i; - - if (per_cpu(ici_cpuid4_info, cpu) == NULL) - return; - if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map))) - return; - cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map)); - - for (i = 0; i < num_cache_leaves; i++) - kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj)); - kobject_put(per_cpu(ici_cache_kobject, cpu)); - cpuid4_cache_sysfs_exit(cpu); -} - -static int cacheinfo_cpu_callback(struct notifier_block *nfb, - unsigned long action, void *hcpu) -{ - unsigned int cpu = (unsigned long)hcpu; - struct device *dev; - - dev = get_cpu_device(cpu); - switch (action) { - case CPU_ONLINE: - case CPU_ONLINE_FROZEN: - cache_add_dev(dev); - break; - case CPU_DEAD: - case CPU_DEAD_FROZEN: - cache_remove_dev(dev); - break; - } - return NOTIFY_OK; -} - -static struct notifier_block cacheinfo_cpu_notifier = { - .notifier_call = cacheinfo_cpu_callback, -}; - -static int __init cache_sysfs_init(void) -{ - int i, err = 0; - - if (num_cache_leaves == 0) - return 0; - - cpu_notifier_register_begin(); - for_each_online_cpu(i) { - struct device *dev = get_cpu_device(i); - - err = cache_add_dev(dev); - if (err) - goto out; - } - __register_hotcpu_notifier(&cacheinfo_cpu_notifier); - -out: - cpu_notifier_register_done(); - return err; -} - -device_initcall(cache_sysfs_init); - -#endif +DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level) +DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves) diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h index 10b4690..fe32074 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-internal.h +++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h @@ -14,6 +14,7 @@ enum severity_level { }; #define ATTR_LEN 16 +#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */ /* One object for each MCE bank, shared by all CPUs */ struct mce_bank { @@ -23,20 +24,20 @@ struct mce_bank { char attrname[ATTR_LEN]; /* attribute name */ }; -int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp); +extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp); struct dentry *mce_get_debugfs_dir(void); extern struct mce_bank *mce_banks; extern mce_banks_t mce_banks_ce_disabled; #ifdef CONFIG_X86_MCE_INTEL -unsigned long mce_intel_adjust_timer(unsigned long interval); -void mce_intel_cmci_poll(void); +unsigned long cmci_intel_adjust_timer(unsigned long interval); +bool mce_intel_cmci_poll(void); void mce_intel_hcpu_update(unsigned long cpu); void cmci_disable_bank(int bank); #else -# define mce_intel_adjust_timer mce_adjust_timer_default -static inline void mce_intel_cmci_poll(void) { } +# define cmci_intel_adjust_timer mce_adjust_timer_default +static inline bool mce_intel_cmci_poll(void) { return false; } static inline void mce_intel_hcpu_update(unsigned long cpu) { } static inline void cmci_disable_bank(int bank) { } #endif diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c index 8bb4330..9c682c2 100644 --- a/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -186,7 +186,61 @@ static int error_context(struct mce *m) return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL; } -int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) +/* + * See AMD Error Scope Hierarchy table in a newer BKDG. For example + * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features" + */ +static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp) +{ + enum context ctx = error_context(m); + + /* Processor Context Corrupt, no need to fumble too much, die! */ + if (m->status & MCI_STATUS_PCC) + return MCE_PANIC_SEVERITY; + + if (m->status & MCI_STATUS_UC) { + + /* + * On older systems where overflow_recov flag is not present, we + * should simply panic if an error overflow occurs. If + * overflow_recov flag is present and set, then software can try + * to at least kill process to prolong system operation. + */ + if (mce_flags.overflow_recov) { + /* software can try to contain */ + if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) + return MCE_PANIC_SEVERITY; + + /* kill current process */ + return MCE_AR_SEVERITY; + } else { + /* at least one error was not logged */ + if (m->status & MCI_STATUS_OVER) + return MCE_PANIC_SEVERITY; + } + + /* + * For any other case, return MCE_UC_SEVERITY so that we log the + * error and exit #MC handler. + */ + return MCE_UC_SEVERITY; + } + + /* + * deferred error: poll handler catches these and adds to mce_ring so + * memory-failure can take recovery actions. + */ + if (m->status & MCI_STATUS_DEFERRED) + return MCE_DEFERRED_SEVERITY; + + /* + * corrected error: poll handler catches these and passes responsibility + * of decoding the error to EDAC + */ + return MCE_KEEP_SEVERITY; +} + +static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp) { enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP); enum context ctx = error_context(m); @@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp) } } +/* Default to mce_severity_intel */ +int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) = + mce_severity_intel; + +void __init mcheck_vendor_init_severity(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + mce_severity = mce_severity_amd; +} + #ifdef CONFIG_DEBUG_FS static void *s_start(struct seq_file *f, loff_t *pos) { diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 3c036cb..e535533 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define CREATE_TRACE_POINTS #include <trace/events/mce.h> -#define SPINUNIT 100 /* 100ns */ +#define SPINUNIT 100 /* 100ns */ DEFINE_PER_CPU(unsigned, mce_exception_count); struct mce_bank *mce_banks __read_mostly; +struct mce_vendor_flags mce_flags __read_mostly; struct mca_config mca_cfg __read_mostly = { .bootlog = -1, @@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait); static DEFINE_PER_CPU(struct mce, mces_seen); static int cpu_missing; -/* CMCI storm detection filter */ -static DEFINE_PER_CPU(unsigned long, mce_polled_error); - /* * MCA banks polled by the period polling timer for corrected events. * With Intel CMCI, this only has MCA banks which do not support CMCI (if any). @@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count); * is already totally * confused. In this case it's likely it will * not fully execute the machine check handler either. */ -void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) +bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b) { + bool error_logged = false; struct mce m; int severity; int i; @@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) if (!(m.status & MCI_STATUS_VAL)) continue; - this_cpu_write(mce_polled_error, 1); + /* * Uncorrected or signalled events are handled by the exception * handler when it is enabled, so don't process those here. @@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) * Don't get the IP here because it's unlikely to * have anything to do with the actual error location. */ - if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) + if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) { + error_logged = true; mce_log(&m); + } /* * Clear state for this bank. @@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b) */ sync_core(); + + return error_logged; } EXPORT_SYMBOL_GPL(machine_check_poll); @@ -813,7 +816,7 @@ static void mce_reign(void) * other CPUs. */ if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3) - mce_panic("Fatal Machine check", m, msg); + mce_panic("Fatal machine check", m, msg); /* * For UC somewhere we let the CPU who detects it handle it. @@ -826,7 +829,7 @@ static void mce_reign(void) * source or one CPU is hung. Panic. */ if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3) - mce_panic("Machine check from unknown source", NULL, NULL); + mce_panic("Fatal machine check from unknown source", NULL, NULL); /* * Now clear all the mces_seen so that they don't reappear on @@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status) * poller finds an MCE, poll 2x faster. When the poller finds no more * errors, poll 2x slower (up to check_interval seconds). */ -static unsigned long check_interval = 5 * 60; /* 5 minutes */ +static unsigned long check_interval = INITIAL_CHECK_INTERVAL; static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */ static DEFINE_PER_CPU(struct timer_list, mce_timer); @@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval) return interval; } -static unsigned long (*mce_adjust_timer)(unsigned long interval) = - mce_adjust_timer_default; +static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default; -static int cmc_error_seen(void) +static void __restart_timer(struct timer_list *t, unsigned long interval) { - unsigned long *v = this_cpu_ptr(&mce_polled_error); + unsigned long when = jiffies + interval; + unsigned long flags; + + local_irq_save(flags); - return test_and_clear_bit(0, v); + if (timer_pending(t)) { + if (time_before(when, t->expires)) + mod_timer_pinned(t, when); + } else { + t->expires = round_jiffies(when); + add_timer_on(t, smp_processor_id()); + } + + local_irq_restore(flags); } static void mce_timer_fn(unsigned long data) { struct timer_list *t = this_cpu_ptr(&mce_timer); + int cpu = smp_processor_id(); unsigned long iv; - int notify; - WARN_ON(smp_processor_id() != data); + WARN_ON(cpu != data); + + iv = __this_cpu_read(mce_next_interval); if (mce_available(this_cpu_ptr(&cpu_info))) { - machine_check_poll(MCP_TIMESTAMP, - this_cpu_ptr(&mce_poll_banks)); - mce_intel_cmci_poll(); + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks)); + + if (mce_intel_cmci_poll()) { + iv = mce_adjust_timer(iv); + goto done; + } } /* - * Alert userspace if needed. If we logged an MCE, reduce the - * polling interval, otherwise increase the polling interval. + * Alert userspace if needed. If we logged an MCE, reduce the polling + * interval, otherwise increase the polling interval. */ - iv = __this_cpu_read(mce_next_interval); - notify = mce_notify_irq(); - notify |= cmc_error_seen(); - if (notify) { + if (mce_notify_irq()) iv = max(iv / 2, (unsigned long) HZ/100); - } else { + else iv = min(iv * 2, round_jiffies_relative(check_interval * HZ)); - iv = mce_adjust_timer(iv); - } + +done: __this_cpu_write(mce_next_interval, iv); - /* Might have become 0 after CMCI storm subsided */ - if (iv) { - t->expires = jiffies + iv; - add_timer_on(t, smp_processor_id()); - } + __restart_timer(t, iv); } /* @@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data) void mce_timer_kick(unsigned long interval) { struct timer_list *t = this_cpu_ptr(&mce_timer); - unsigned long when = jiffies + interval; unsigned long iv = __this_cpu_read(mce_next_interval); - if (timer_pending(t)) { - if (time_before(when, t->expires)) - mod_timer_pinned(t, when); - } else { - t->expires = round_jiffies(when); - add_timer_on(t, smp_processor_id()); - } + __restart_timer(t, interval); + if (interval < iv) __this_cpu_write(mce_next_interval, interval); } @@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) * Various K7s with broken bank 0 around. Always disable * by default. */ - if (c->x86 == 6 && cfg->banks > 0) + if (c->x86 == 6 && cfg->banks > 0) mce_banks[0].ctl = 0; - /* - * Turn off MC4_MISC thresholding banks on those models since - * they're not supported there. - */ - if (c->x86 == 0x15 && - (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { - int i; - u64 val, hwcr; - bool need_toggle; - u32 msrs[] = { + /* + * overflow_recov is supported for F15h Models 00h-0fh + * even though we don't have a CPUID bit for it. + */ + if (c->x86 == 0x15 && c->x86_model <= 0xf) + mce_flags.overflow_recov = 1; + + /* + * Turn off MC4_MISC thresholding banks on those models since + * they're not supported there. + */ + if (c->x86 == 0x15 && + (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) { + int i; + u64 hwcr; + bool need_toggle; + u32 msrs[] = { 0x00000413, /* MC4_MISC0 */ 0xc0000408, /* MC4_MISC1 */ - }; + }; - rdmsrl(MSR_K7_HWCR, hwcr); + rdmsrl(MSR_K7_HWCR, hwcr); - /* McStatusWrEn has to be set */ - need_toggle = !(hwcr & BIT(18)); + /* McStatusWrEn has to be set */ + need_toggle = !(hwcr & BIT(18)); - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr | BIT(18)); - for (i = 0; i < ARRAY_SIZE(msrs); i++) { - rdmsrl(msrs[i], val); + /* Clear CntP bit safely */ + for (i = 0; i < ARRAY_SIZE(msrs); i++) + msr_clear_bit(msrs[i], 62); - /* CntP bit set? */ - if (val & BIT_64(62)) { - val &= ~BIT_64(62); - wrmsrl(msrs[i], val); - } - } - - /* restore old settings */ - if (need_toggle) - wrmsrl(MSR_K7_HWCR, hwcr); - } + /* restore old settings */ + if (need_toggle) + wrmsrl(MSR_K7_HWCR, hwcr); + } } if (c->x86_vendor == X86_VENDOR_INTEL) { @@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c) switch (c->x86_vendor) { case X86_VENDOR_INTEL: mce_intel_feature_init(c); - mce_adjust_timer = mce_intel_adjust_timer; + mce_adjust_timer = cmci_intel_adjust_timer; break; case X86_VENDOR_AMD: mce_amd_feature_init(c); + mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1; break; default: break; @@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable); int __init mcheck_init(void) { mcheck_intel_therm_init(); + mcheck_vendor_init_severity(); return 0; } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f1c3769..55ad9b3 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank) return (bank == 4); } -static const char * const bank4_names(struct threshold_block *b) +static const char *bank4_names(const struct threshold_block *b) { switch (b->address) { /* MSR4_MISC0 */ @@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!b.interrupt_capable) goto init; + b.interrupt_enable = 1; new = (high & MASK_LVTOFF_HI) >> 20; offset = setup_APIC_mce(offset, new); @@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void) log: mce_setup(&m); rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status); + if (!(m.status & MCI_STATUS_VAL)) + return; m.misc = ((u64)high << 32) | low; m.bank = bank; mce_log(&m); @@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, b->interrupt_capable = lvt_interrupt_supported(bank, high); b->threshold_limit = THRESHOLD_MAX; - if (b->interrupt_capable) + if (b->interrupt_capable) { threshold_ktype.default_attrs[2] = &interrupt_enable.attr; - else + b->interrupt_enable = 1; + } else { threshold_ktype.default_attrs[2] = NULL; + } INIT_LIST_HEAD(&b->miscj); diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c index b3c97ba..b4a41cf 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c @@ -39,6 +39,15 @@ static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned); /* + * CMCI storm detection backoff counter + * + * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've + * encountered an error. If not, we decrement it by one. We signal the end of + * the CMCI storm when it reaches 0. + */ +static DEFINE_PER_CPU(int, cmci_backoff_cnt); + +/* * cmci_discover_lock protects against parallel discovery attempts * which could race against each other. */ @@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock); #define CMCI_THRESHOLD 1 #define CMCI_POLL_INTERVAL (30 * HZ) -#define CMCI_STORM_INTERVAL (1 * HZ) +#define CMCI_STORM_INTERVAL (HZ) #define CMCI_STORM_THRESHOLD 15 static DEFINE_PER_CPU(unsigned long, cmci_time_stamp); @@ -82,11 +91,21 @@ static int cmci_supported(int *banks) return !!(cap & MCG_CMCI_P); } -void mce_intel_cmci_poll(void) +bool mce_intel_cmci_poll(void) { if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE) - return; - machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); + return false; + + /* + * Reset the counter if we've logged an error in the last poll + * during the storm. + */ + if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned))) + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); + else + this_cpu_dec(cmci_backoff_cnt); + + return true; } void mce_intel_hcpu_update(unsigned long cpu) @@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu) per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE; } -unsigned long mce_intel_adjust_timer(unsigned long interval) +unsigned long cmci_intel_adjust_timer(unsigned long interval) { - int r; - - if (interval < CMCI_POLL_INTERVAL) - return interval; + if ((this_cpu_read(cmci_backoff_cnt) > 0) && + (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) { + mce_notify_irq(); + return CMCI_STORM_INTERVAL; + } switch (__this_cpu_read(cmci_storm_state)) { case CMCI_STORM_ACTIVE: + /* * We switch back to interrupt mode once the poll timer has - * silenced itself. That means no events recorded and the - * timer interval is back to our poll interval. + * silenced itself. That means no events recorded and the timer + * interval is back to our poll interval. */ __this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED); - r = atomic_sub_return(1, &cmci_storm_on_cpus); - if (r == 0) + if (!atomic_sub_return(1, &cmci_storm_on_cpus)) pr_notice("CMCI storm subsided: switching to interrupt mode\n"); + /* FALLTHROUGH */ case CMCI_STORM_SUBSIDED: /* - * We wait for all cpus to go back to SUBSIDED - * state. When that happens we switch back to - * interrupt mode. + * We wait for all CPUs to go back to SUBSIDED state. When that + * happens we switch back to interrupt mode. */ if (!atomic_read(&cmci_storm_on_cpus)) { __this_cpu_write(cmci_storm_state, CMCI_STORM_NONE); @@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval) } return CMCI_POLL_INTERVAL; default: - /* - * We have shiny weather. Let the poll do whatever it - * thinks. - */ + + /* We have shiny weather. Let the poll do whatever it thinks. */ return interval; } } @@ -178,7 +196,8 @@ static bool cmci_storm_detect(void) cmci_storm_disable_banks(); __this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE); r = atomic_add_return(1, &cmci_storm_on_cpus); - mce_timer_kick(CMCI_POLL_INTERVAL); + mce_timer_kick(CMCI_STORM_INTERVAL); + this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL); if (r == 1) pr_notice("CMCI storm detected: switching to poll mode\n"); @@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void) { if (cmci_storm_detect()) return; + machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); mce_notify_irq(); } @@ -286,6 +306,7 @@ void cmci_recheck(void) if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks)) return; + local_irq_save(flags); machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)); local_irq_restore(flags); diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c index bfbbe61..12829c3 100644 --- a/arch/x86/kernel/cpu/microcode/amd.c +++ b/arch/x86/kernel/cpu/microcode/amd.c @@ -21,7 +21,6 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #include <linux/firmware.h> -#include <linux/pci_ids.h> #include <linux/uaccess.h> #include <linux/vmalloc.h> #include <linux/kernel.h> diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c index d45df4bd..a413a69 100644 --- a/arch/x86/kernel/cpu/microcode/core_early.c +++ b/arch/x86/kernel/cpu/microcode/core_early.c @@ -23,57 +23,6 @@ #include <asm/processor.h> #include <asm/cmdline.h> -#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24)) -#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u') -#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I') -#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l') -#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h') -#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i') -#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D') - -#define CPUID_IS(a, b, c, ebx, ecx, edx) \ - (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c)))) - -/* - * In early loading microcode phase on BSP, boot_cpu_data is not set up yet. - * x86_vendor() gets vendor id for BSP. - * - * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify - * coding, we still use x86_vendor() to get vendor id for AP. - * - * x86_vendor() gets vendor information directly through cpuid. - */ -static int x86_vendor(void) -{ - u32 eax = 0x00000000; - u32 ebx, ecx = 0, edx; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx)) - return X86_VENDOR_INTEL; - - if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx)) - return X86_VENDOR_AMD; - - return X86_VENDOR_UNKNOWN; -} - -static int x86_family(void) -{ - u32 eax = 0x00000001; - u32 ebx, ecx = 0, edx; - int x86; - - native_cpuid(&eax, &ebx, &ecx, &edx); - - x86 = (eax >> 8) & 0xf; - if (x86 == 15) - x86 += (eax >> 20) & 0xff; - - return x86; -} - static bool __init check_loader_disabled_bsp(void) { #ifdef CONFIG_X86_32 @@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void) void __init load_ucode_bsp(void) { - int vendor, x86; + int vendor, family; if (check_loader_disabled_bsp()) return; @@ -105,15 +54,15 @@ void __init load_ucode_bsp(void) return; vendor = x86_vendor(); - x86 = x86_family(); + family = x86_family(); switch (vendor) { case X86_VENDOR_INTEL: - if (x86 >= 6) + if (family >= 6) load_ucode_intel_bsp(); break; case X86_VENDOR_AMD: - if (x86 >= 0x10) + if (family >= 0x10) load_ucode_amd_bsp(); break; default: @@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void) void load_ucode_ap(void) { - int vendor, x86; + int vendor, family; if (check_loader_disabled_ap()) return; @@ -141,15 +90,15 @@ void load_ucode_ap(void) return; vendor = x86_vendor(); - x86 = x86_family(); + family = x86_family(); switch (vendor) { case X86_VENDOR_INTEL: - if (x86 >= 6) + if (family >= 6) load_ucode_intel_ap(); break; case X86_VENDOR_AMD: - if (x86 >= 0x10) + if (family >= 0x10) load_ucode_amd_ap(); break; default: @@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void) void reload_early_microcode(void) { - int vendor, x86; + int vendor, family; vendor = x86_vendor(); - x86 = x86_family(); + family = x86_family(); switch (vendor) { case X86_VENDOR_INTEL: - if (x86 >= 6) + if (family >= 6) reload_ucode_intel(); break; case X86_VENDOR_AMD: - if (x86 >= 0x10) + if (family >= 0x10) reload_ucode_amd(); break; default: diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c index 746e7fd..a41bead 100644 --- a/arch/x86/kernel/cpu/microcode/intel.c +++ b/arch/x86/kernel/cpu/microcode/intel.c @@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu) cpf = cpu_sig.pf; crev = cpu_sig.rev; - return get_matching_microcode(csig, cpf, mc_intel, crev); + return get_matching_microcode(csig, cpf, crev, mc_intel); } static int apply_microcode_intel(int cpu) @@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, csig = uci->cpu_sig.sig; cpf = uci->cpu_sig.pf; - if (get_matching_microcode(csig, cpf, mc, new_rev)) { + if (get_matching_microcode(csig, cpf, new_rev, mc)) { vfree(new_mc); new_rev = mc_header.rev; new_mc = mc; diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c index 420eb93..2f49ab4 100644 --- a/arch/x86/kernel/cpu/microcode/intel_early.c +++ b/arch/x86/kernel/cpu/microcode/intel_early.c @@ -16,6 +16,14 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ + +/* + * This needs to be before all headers so that pr_debug in printk.h doesn't turn + * printk calls into no_printk(). + * + *#define DEBUG + */ + #include <linux/module.h> #include <linux/mm.h> #include <linux/slab.h> @@ -28,6 +36,9 @@ #include <asm/tlbflush.h> #include <asm/setup.h> +#undef pr_fmt +#define pr_fmt(fmt) "microcode: " fmt + static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT]; static struct mc_saved_data { unsigned int mc_saved_count; @@ -35,50 +46,45 @@ static struct mc_saved_data { } mc_saved_data; static enum ucode_state -generic_load_microcode_early(struct microcode_intel **mc_saved_p, - unsigned int mc_saved_count, - struct ucode_cpu_info *uci) +load_microcode_early(struct microcode_intel **saved, + unsigned int num_saved, struct ucode_cpu_info *uci) { struct microcode_intel *ucode_ptr, *new_mc = NULL; - int new_rev = uci->cpu_sig.rev; - enum ucode_state state = UCODE_OK; - unsigned int mc_size; - struct microcode_header_intel *mc_header; - unsigned int csig = uci->cpu_sig.sig; - unsigned int cpf = uci->cpu_sig.pf; - int i; + struct microcode_header_intel *mc_hdr; + int new_rev, ret, i; - for (i = 0; i < mc_saved_count; i++) { - ucode_ptr = mc_saved_p[i]; + new_rev = uci->cpu_sig.rev; - mc_header = (struct microcode_header_intel *)ucode_ptr; - mc_size = get_totalsize(mc_header); - if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) { - new_rev = mc_header->rev; - new_mc = ucode_ptr; - } - } + for (i = 0; i < num_saved; i++) { + ucode_ptr = saved[i]; + mc_hdr = (struct microcode_header_intel *)ucode_ptr; - if (!new_mc) { - state = UCODE_NFOUND; - goto out; + ret = get_matching_microcode(uci->cpu_sig.sig, + uci->cpu_sig.pf, + new_rev, + ucode_ptr); + if (!ret) + continue; + + new_rev = mc_hdr->rev; + new_mc = ucode_ptr; } + if (!new_mc) + return UCODE_NFOUND; + uci->mc = (struct microcode_intel *)new_mc; -out: - return state; + return UCODE_OK; } -static void -microcode_pointer(struct microcode_intel **mc_saved, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start, int mc_saved_count) +static inline void +copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd, + unsigned long off, int num_saved) { int i; - for (i = 0; i < mc_saved_count; i++) - mc_saved[i] = (struct microcode_intel *) - (mc_saved_in_initrd[i] + initrd_start); + for (i = 0; i < num_saved; i++) + mc_saved[i] = (struct microcode_intel *)(initrd[i] + off); } #ifdef CONFIG_X86_32 @@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp, #endif static enum ucode_state -load_microcode(struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start, - struct ucode_cpu_info *uci) +load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long initrd_start, struct ucode_cpu_info *uci) { struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT]; unsigned int count = mc_saved_data->mc_saved_count; if (!mc_saved_data->mc_saved) { - microcode_pointer(mc_saved_tmp, mc_saved_in_initrd, - initrd_start, count); + copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count); - return generic_load_microcode_early(mc_saved_tmp, count, uci); + return load_microcode_early(mc_saved_tmp, count, uci); } else { #ifdef CONFIG_X86_32 microcode_phys(mc_saved_tmp, mc_saved_data); - return generic_load_microcode_early(mc_saved_tmp, count, uci); + return load_microcode_early(mc_saved_tmp, count, uci); #else - return generic_load_microcode_early(mc_saved_data->mc_saved, + return load_microcode_early(mc_saved_data->mc_saved, count, uci); #endif } } -static u8 get_x86_family(unsigned long sig) -{ - u8 x86; - - x86 = (sig >> 8) & 0xf; - - if (x86 == 0xf) - x86 += (sig >> 20) & 0xff; - - return x86; -} - -static u8 get_x86_model(unsigned long sig) -{ - u8 x86, x86_model; - - x86 = get_x86_family(sig); - x86_model = (sig >> 4) & 0xf; - - if (x86 == 0x6 || x86 == 0xf) - x86_model += ((sig >> 16) & 0xf) << 4; - - return x86_model; -} - /* * Given CPU signature and a microcode patch, this function finds if the * microcode patch has matching family and model with the CPU. @@ -159,42 +137,40 @@ static enum ucode_state matching_model_microcode(struct microcode_header_intel *mc_header, unsigned long sig) { - u8 x86, x86_model; - u8 x86_ucode, x86_model_ucode; + unsigned int fam, model; + unsigned int fam_ucode, model_ucode; struct extended_sigtable *ext_header; unsigned long total_size = get_totalsize(mc_header); unsigned long data_size = get_datasize(mc_header); int ext_sigcount, i; struct extended_signature *ext_sig; - x86 = get_x86_family(sig); - x86_model = get_x86_model(sig); + fam = __x86_family(sig); + model = x86_model(sig); - x86_ucode = get_x86_family(mc_header->sig); - x86_model_ucode = get_x86_model(mc_header->sig); + fam_ucode = __x86_family(mc_header->sig); + model_ucode = x86_model(mc_header->sig); - if (x86 == x86_ucode && x86_model == x86_model_ucode) + if (fam == fam_ucode && model == model_ucode) return UCODE_OK; /* Look for ext. headers: */ if (total_size <= data_size + MC_HEADER_SIZE) return UCODE_NFOUND; - ext_header = (struct extended_sigtable *) - mc_header + data_size + MC_HEADER_SIZE; + ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE; + ext_sig = (void *)ext_header + EXT_HEADER_SIZE; ext_sigcount = ext_header->count; - ext_sig = (void *)ext_header + EXT_HEADER_SIZE; for (i = 0; i < ext_sigcount; i++) { - x86_ucode = get_x86_family(ext_sig->sig); - x86_model_ucode = get_x86_model(ext_sig->sig); + fam_ucode = __x86_family(ext_sig->sig); + model_ucode = x86_model(ext_sig->sig); - if (x86 == x86_ucode && x86_model == x86_model_ucode) + if (fam == fam_ucode && model == model_ucode) return UCODE_OK; ext_sig++; } - return UCODE_NFOUND; } @@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data, unsigned int mc_saved_count) { int i, j; - struct microcode_intel **mc_saved_p; + struct microcode_intel **saved_ptr; int ret; if (!mc_saved_count) @@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data, /* * Copy new microcode data. */ - mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *), - GFP_KERNEL); - if (!mc_saved_p) + saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL); + if (!saved_ptr) return -ENOMEM; for (i = 0; i < mc_saved_count; i++) { - struct microcode_intel *mc = mc_saved_src[i]; - struct microcode_header_intel *mc_header = &mc->hdr; - unsigned long mc_size = get_totalsize(mc_header); - mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL); - if (!mc_saved_p[i]) { - ret = -ENOMEM; - goto err; - } + struct microcode_header_intel *mc_hdr; + struct microcode_intel *mc; + unsigned long size; + if (!mc_saved_src[i]) { ret = -EINVAL; goto err; } - memcpy(mc_saved_p[i], mc, mc_size); + + mc = mc_saved_src[i]; + mc_hdr = &mc->hdr; + size = get_totalsize(mc_hdr); + + saved_ptr[i] = kmalloc(size, GFP_KERNEL); + if (!saved_ptr[i]) { + ret = -ENOMEM; + goto err; + } + + memcpy(saved_ptr[i], mc, size); } /* * Point to newly saved microcode. */ - mc_saved_data->mc_saved = mc_saved_p; + mc_saved_data->mc_saved = saved_ptr; mc_saved_data->mc_saved_count = mc_saved_count; return 0; err: for (j = 0; j <= i; j++) - kfree(mc_saved_p[j]); - kfree(mc_saved_p); + kfree(saved_ptr[j]); + kfree(saved_ptr); return ret; } @@ -257,48 +239,45 @@ err: * - or if it is a newly discovered microcode patch. * * The microcode patch should have matching model with CPU. + * + * Returns: The updated number @num_saved of saved microcode patches. */ -static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr, - unsigned int *mc_saved_count_p) +static unsigned int _save_mc(struct microcode_intel **mc_saved, + u8 *ucode_ptr, unsigned int num_saved) { - int i; - int found = 0; - unsigned int mc_saved_count = *mc_saved_count_p; - struct microcode_header_intel *mc_header; + struct microcode_header_intel *mc_hdr, *mc_saved_hdr; + unsigned int sig, pf, new_rev; + int found = 0, i; + + mc_hdr = (struct microcode_header_intel *)ucode_ptr; + + for (i = 0; i < num_saved; i++) { + mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i]; + sig = mc_saved_hdr->sig; + pf = mc_saved_hdr->pf; + new_rev = mc_hdr->rev; + + if (!get_matching_sig(sig, pf, new_rev, ucode_ptr)) + continue; + + found = 1; + + if (!revision_is_newer(mc_hdr, new_rev)) + continue; - mc_header = (struct microcode_header_intel *)ucode_ptr; - for (i = 0; i < mc_saved_count; i++) { - unsigned int sig, pf; - unsigned int new_rev; - struct microcode_header_intel *mc_saved_header = - (struct microcode_header_intel *)mc_saved[i]; - sig = mc_saved_header->sig; - pf = mc_saved_header->pf; - new_rev = mc_header->rev; - - if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) { - found = 1; - if (update_match_revision(mc_header, new_rev)) { - /* - * Found an older ucode saved before. - * Replace the older one with this newer - * one. - */ - mc_saved[i] = - (struct microcode_intel *)ucode_ptr; - break; - } - } - } - if (i >= mc_saved_count && !found) /* - * This ucode is first time discovered in ucode file. - * Save it to memory. + * Found an older ucode saved earlier. Replace it with + * this newer one. */ - mc_saved[mc_saved_count++] = - (struct microcode_intel *)ucode_ptr; + mc_saved[i] = (struct microcode_intel *)ucode_ptr; + break; + } + + /* Newly detected microcode, save it to memory. */ + if (i >= num_saved && !found) + mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr; - *mc_saved_count_p = mc_saved_count; + return num_saved; } /* @@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start, continue; } - _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count); + mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count); ucode_ptr += mc_size; } @@ -372,7 +351,7 @@ out: static int collect_cpu_info_early(struct ucode_cpu_info *uci) { unsigned int val[2]; - u8 x86, x86_model; + unsigned int family, model; struct cpu_signature csig; unsigned int eax, ebx, ecx, edx; @@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci) native_cpuid(&eax, &ebx, &ecx, &edx); csig.sig = eax; - x86 = get_x86_family(csig.sig); - x86_model = get_x86_model(csig.sig); + family = __x86_family(csig.sig); + model = x86_model(csig.sig); - if ((x86_model >= 5) || (x86 > 6)) { + if ((model >= 5) || (family > 6)) { /* get processor flags from MSR 0x17 */ native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]); csig.pf = 1 << ((val[1] >> 18) & 7); @@ -429,8 +408,7 @@ static void __ref show_saved_mc(void) sig = uci.cpu_sig.sig; pf = uci.cpu_sig.pf; rev = uci.cpu_sig.rev; - pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n", - smp_processor_id(), sig, pf, rev); + pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev); for (i = 0; i < mc_saved_data.mc_saved_count; i++) { struct microcode_header_intel *mc_saved_header; @@ -457,8 +435,7 @@ static void __ref show_saved_mc(void) if (total_size <= data_size + MC_HEADER_SIZE) continue; - ext_header = (struct extended_sigtable *) - mc_saved_header + data_size + MC_HEADER_SIZE; + ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE; ext_sigcount = ext_header->count; ext_sig = (void *)ext_header + EXT_HEADER_SIZE; @@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc) * Save the microcode patch mc in mc_save_tmp structure if it's a newer * version. */ - - _save_mc(mc_saved_tmp, mc, &mc_saved_count); + mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count); /* * Save the mc_save_tmp in global mc_saved_data. @@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early); static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin"; static __init enum ucode_state -scan_microcode(unsigned long start, unsigned long end, - struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - struct ucode_cpu_info *uci) +scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd, + unsigned long start, unsigned long size, + struct ucode_cpu_info *uci) { - unsigned int size = end - start + 1; struct cpio_data cd; long offset = 0; #ifdef CONFIG_X86_32 @@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end, if (!cd.data) return UCODE_ERROR; - return get_matching_model_microcode(0, start, cd.data, cd.size, - mc_saved_data, mc_saved_in_initrd, - uci); + mc_saved_data, initrd, uci); } /* @@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void) if (count == 0) return ret; - microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count); + copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count); ret = save_microcode(&mc_saved_data, mc_saved, count); if (ret) pr_err("Cannot save microcode patches from initrd.\n"); @@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void) static void __init _load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data, - unsigned long *mc_saved_in_initrd, - unsigned long initrd_start_early, - unsigned long initrd_end_early, - struct ucode_cpu_info *uci) + unsigned long *initrd, + unsigned long start, unsigned long size) { + struct ucode_cpu_info uci; enum ucode_state ret; - collect_cpu_info_early(uci); - scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data, - mc_saved_in_initrd, uci); + collect_cpu_info_early(&uci); - ret = load_microcode(mc_saved_data, mc_saved_in_initrd, - initrd_start_early, uci); + ret = scan_microcode(mc_saved_data, initrd, start, size, &uci); + if (ret != UCODE_OK) + return; - if (ret == UCODE_OK) - apply_microcode_early(uci, true); + ret = load_microcode(mc_saved_data, initrd, start, &uci); + if (ret != UCODE_OK) + return; + + apply_microcode_early(&uci, true); } -void __init -load_ucode_intel_bsp(void) +void __init load_ucode_intel_bsp(void) { - u64 ramdisk_image, ramdisk_size; - unsigned long initrd_start_early, initrd_end_early; - struct ucode_cpu_info uci; + u64 start, size; #ifdef CONFIG_X86_32 - struct boot_params *boot_params_p; + struct boot_params *p; - boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params); - ramdisk_image = boot_params_p->hdr.ramdisk_image; - ramdisk_size = boot_params_p->hdr.ramdisk_size; - initrd_start_early = ramdisk_image; - initrd_end_early = initrd_start_early + ramdisk_size; + p = (struct boot_params *)__pa_nodebug(&boot_params); + start = p->hdr.ramdisk_image; + size = p->hdr.ramdisk_size; _load_ucode_intel_bsp( - (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), - (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), - initrd_start_early, initrd_end_early, &uci); + (struct mc_saved_data *)__pa_nodebug(&mc_saved_data), + (unsigned long *)__pa_nodebug(&mc_saved_in_initrd), + start, size); #else - ramdisk_image = boot_params.hdr.ramdisk_image; - ramdisk_size = boot_params.hdr.ramdisk_size; - initrd_start_early = ramdisk_image + PAGE_OFFSET; - initrd_end_early = initrd_start_early + ramdisk_size; - - _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, - initrd_start_early, initrd_end_early, - &uci); + start = boot_params.hdr.ramdisk_image + PAGE_OFFSET; + size = boot_params.hdr.ramdisk_size; + + _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size); #endif } @@ -771,6 +735,7 @@ void load_ucode_intel_ap(void) struct ucode_cpu_info uci; unsigned long *mc_saved_in_initrd_p; unsigned long initrd_start_addr; + enum ucode_state ret; #ifdef CONFIG_X86_32 unsigned long *initrd_start_p; @@ -793,8 +758,12 @@ void load_ucode_intel_ap(void) return; collect_cpu_info_early(&uci); - load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, - initrd_start_addr, &uci); + ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p, + initrd_start_addr, &uci); + + if (ret != UCODE_OK) + return; + apply_microcode_early(&uci, true); } @@ -808,8 +777,8 @@ void reload_ucode_intel(void) collect_cpu_info_early(&uci); - ret = generic_load_microcode_early(mc_saved_data.mc_saved, - mc_saved_data.mc_saved_count, &uci); + ret = load_microcode_early(mc_saved_data.mc_saved, + mc_saved_data.mc_saved_count, &uci); if (ret != UCODE_OK) return; diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c index ce69320..cd47a51 100644 --- a/arch/x86/kernel/cpu/microcode/intel_lib.c +++ b/arch/x86/kernel/cpu/microcode/intel_lib.c @@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf, return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1; } -int -update_match_revision(struct microcode_header_intel *mc_header, int rev) -{ - return (mc_header->rev <= rev) ? 0 : 1; -} - int microcode_sanity_check(void *mc, int print_err) { unsigned long total_size, data_size, ext_table_size; @@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err) EXPORT_SYMBOL_GPL(microcode_sanity_check); /* - * return 0 - no update found - * return 1 - found update + * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) +int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc) { struct microcode_header_intel *mc_header = mc; struct extended_sigtable *ext_header; @@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev) } /* - * return 0 - no update found - * return 1 - found update + * Returns 1 if update has been found, 0 otherwise. */ -int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev) +int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc) { - struct microcode_header_intel *mc_header = mc; + struct microcode_header_intel *mc_hdr = mc; - if (!update_match_revision(mc_header, rev)) + if (!revision_is_newer(mc_hdr, rev)) return 0; - return get_matching_sig(csig, cpf, mc, rev); + return get_matching_sig(csig, cpf, rev, mc); } EXPORT_SYMBOL_GPL(get_matching_microcode); diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh index 36d99a3..3f20710 100644 --- a/arch/x86/kernel/cpu/mkcapflags.sh +++ b/arch/x86/kernel/cpu/mkcapflags.sh @@ -6,7 +6,7 @@ IN=$1 OUT=$2 -function dump_array() +dump_array() { ARRAY=$1 SIZE=$2 diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 689e357..87848eb 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2236,24 +2236,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) static unsigned long code_segment_base(struct pt_regs *regs) { /* + * For IA32 we look at the GDT/LDT segment base to convert the + * effective IP to a linear address. + */ + +#ifdef CONFIG_X86_32 + /* * If we are in VM86 mode, add the segment offset to convert to a * linear address. */ if (regs->flags & X86_VM_MASK) return 0x10 * regs->cs; - /* - * For IA32 we look at the GDT/LDT segment base to convert the - * effective IP to a linear address. - */ -#ifdef CONFIG_X86_32 if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else - if (test_thread_flag(TIF_IA32)) { - if (user_mode(regs) && regs->cs != __USER32_CS) - return get_segment_base(regs->cs); - } + if (user_mode(regs) && !user_64bit_mode(regs) && + regs->cs != __USER32_CS) + return get_segment_base(regs->cs); #endif return 0; } diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index aceb2f9..c76d3e3 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs) #ifdef CONFIG_X86_32 struct pt_regs fixed_regs; - if (!user_mode_vm(regs)) { + if (!user_mode(regs)) { crash_fixup_ss_esp(&fixed_regs, regs); regs = &fixed_regs; } diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c index 3d35033..6367a78 100644 --- a/arch/x86/kernel/devicetree.c +++ b/arch/x86/kernel/devicetree.c @@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void) initial_boot_params = dt = early_memremap(initial_dtb, map_len); size = of_get_flat_dt_size(); if (map_len < size) { - early_iounmap(dt, map_len); + early_memunmap(dt, map_len); initial_boot_params = dt = early_memremap(initial_dtb, size); map_len = size; } unflatten_and_copy_device_tree(); - early_iounmap(dt, map_len); + early_memunmap(dt, map_len); } #else static inline void x86_flattree_get_config(void) { } diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index cf3df1d..9c30acf 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -25,10 +25,12 @@ unsigned int code_bytes = 64; int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE; static int die_counter; -static void printk_stack_address(unsigned long address, int reliable) +static void printk_stack_address(unsigned long address, int reliable, + void *data) { - pr_cont(" [<%p>] %s%pB\n", - (void *)address, reliable ? "" : "? ", (void *)address); + printk("%s [<%p>] %s%pB\n", + (char *)data, (void *)address, reliable ? "" : "? ", + (void *)address); } void printk_address(unsigned long address) @@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name) static void print_trace_address(void *data, unsigned long addr, int reliable) { touch_nmi_watchdog(); - printk(data); - printk_stack_address(addr, reliable); + printk_stack_address(addr, reliable, data); } static const struct stacktrace_ops print_trace_ops = { @@ -278,7 +279,7 @@ int __die(const char *str, struct pt_regs *regs, long err) print_modules(); show_regs(regs); #ifdef CONFIG_X86_32 - if (user_mode_vm(regs)) { + if (user_mode(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; } else { @@ -307,7 +308,7 @@ void die(const char *str, struct pt_regs *regs, long err) unsigned long flags = oops_begin(); int sig = SIGSEGV; - if (!user_mode_vm(regs)) + if (!user_mode(regs)) report_bug(regs->ip, regs); if (__die(str, regs, err)) diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c index 5abd4cd..464ffd6 100644 --- a/arch/x86/kernel/dumpstack_32.c +++ b/arch/x86/kernel/dumpstack_32.c @@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, for (i = 0; i < kstack_depth_to_print; i++) { if (kstack_end(stack)) break; - if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - pr_cont("\n"); - pr_cont(" %08lx", *stack++); + if ((i % STACKSLOTS_PER_LINE) == 0) { + if (i != 0) + pr_cont("\n"); + printk("%s %08lx", log_lvl, *stack++); + } else + pr_cont(" %08lx", *stack++); touch_nmi_watchdog(); } pr_cont("\n"); @@ -123,13 +126,13 @@ void show_regs(struct pt_regs *regs) int i; show_regs_print_info(KERN_EMERG); - __show_regs(regs, !user_mode_vm(regs)); + __show_regs(regs, !user_mode(regs)); /* * When in-kernel, we also print out the stack and code at the * time of the fault.. */ - if (!user_mode_vm(regs)) { + if (!user_mode(regs)) { unsigned int code_prologue = code_bytes * 43 / 64; unsigned int code_len = code_bytes; unsigned char c; diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index ff86f19..5f1c626 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -280,12 +280,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, pr_cont(" <EOI> "); } } else { - if (((long) stack & (THREAD_SIZE-1)) == 0) + if (kstack_end(stack)) break; } - if (i && ((i % STACKSLOTS_PER_LINE) == 0)) - pr_cont("\n"); - pr_cont(" %016lx", *stack++); + if ((i % STACKSLOTS_PER_LINE) == 0) { + if (i != 0) + pr_cont("\n"); + printk("%s %016lx", log_lvl, *stack++); + } else + pr_cont(" %016lx", *stack++); touch_nmi_watchdog(); } preempt_enable(); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 46201de..7d46bb2 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len) extmap = (struct e820entry *)(sdata->data); __append_e820_map(extmap, entries); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); - early_iounmap(sdata, data_len); + early_memunmap(sdata, data_len); printk(KERN_INFO "e820: extended physical RAM map:\n"); e820_print_map("extended"); } diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c index a62536a..49ff55e 100644 --- a/arch/x86/kernel/early_printk.c +++ b/arch/x86/kernel/early_printk.c @@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */ #define DLL 0 /* Divisor Latch Low */ #define DLH 1 /* Divisor latch High */ -static void mem32_serial_out(unsigned long addr, int offset, int value) -{ - uint32_t *vaddr = (uint32_t *)addr; - /* shift implied by pointer type */ - writel(value, vaddr + offset); -} - -static unsigned int mem32_serial_in(unsigned long addr, int offset) -{ - uint32_t *vaddr = (uint32_t *)addr; - /* shift implied by pointer type */ - return readl(vaddr + offset); -} - static unsigned int io_serial_in(unsigned long addr, int offset) { return inb(addr + offset); @@ -205,6 +191,20 @@ static __init void early_serial_init(char *s) } #ifdef CONFIG_PCI +static void mem32_serial_out(unsigned long addr, int offset, int value) +{ + u32 *vaddr = (u32 *)addr; + /* shift implied by pointer type */ + writel(value, vaddr + offset); +} + +static unsigned int mem32_serial_in(unsigned long addr, int offset) +{ + u32 *vaddr = (u32 *)addr; + /* shift implied by pointer type */ + return readl(vaddr + offset); +} + /* * early_pci_serial_init() * @@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s) unsigned divisor; unsigned long baud = DEFAULT_BAUD; u8 bus, slot, func; - uint32_t classcode, bar0; - uint16_t cmdreg; + u32 classcode, bar0; + u16 cmdreg; char *e; diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 31e2d5b..1c30976 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -395,10 +395,13 @@ sysenter_past_esp: /*CFI_REL_OFFSET cs, 0*/ /* * Push current_thread_info()->sysenter_return to the stack. - * A tiny bit of offset fixup is necessary - 4*4 means the 4 words - * pushed above; +8 corresponds to copy_thread's esp0 setting. + * A tiny bit of offset fixup is necessary: TI_sysenter_return + * is relative to thread_info, which is at the bottom of the + * kernel stack page. 4*4 means the 4 words pushed above; + * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack; + * and THREAD_SIZE takes us to the bottom. */ - pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax @@ -432,7 +435,7 @@ sysenter_after_call: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx - jne sysexit_audit + jnz sysexit_audit sysenter_exit: /* if something modifies registers it must also disable sysexit */ movl PT_EIP(%esp), %edx @@ -460,7 +463,7 @@ sysenter_audit: sysexit_audit: testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jne syscall_exit_work + jnz syscall_exit_work TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_ANY) movl %eax,%edx /* second arg, syscall return value */ @@ -472,7 +475,7 @@ sysexit_audit: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx - jne syscall_exit_work + jnz syscall_exit_work movl PT_EAX(%esp),%eax /* reload syscall return value */ jmp sysenter_exit #endif @@ -510,7 +513,7 @@ syscall_exit: TRACE_IRQS_OFF movl TI_flags(%ebp), %ecx testl $_TIF_ALLWORK_MASK, %ecx # current->work - jne syscall_exit_work + jnz syscall_exit_work restore_all: TRACE_IRQS_IRET @@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and #ifdef CONFIG_VM86 testl $X86_EFLAGS_VM, PT_EFLAGS(%esp) movl %esp, %eax - jne work_notifysig_v86 # returning to kernel-space or + jnz work_notifysig_v86 # returning to kernel-space or # vm86-space 1: #else @@ -720,43 +723,22 @@ END(sysenter_badsys) .endm /* - * Build the entry stubs and pointer table with some assembler magic. - * We pack 7 stubs into a single 32-byte chunk, which will fit in a - * single cache line on all modern x86 implementations. + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. */ -.section .init.rodata,"a" -ENTRY(interrupt) -.section .entry.text, "ax" - .p2align 5 - .p2align CONFIG_X86_L1_CACHE_SHIFT + .align 8 ENTRY(irq_entries_start) RING0_INT_FRAME -vector=FIRST_EXTERNAL_VECTOR -.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 - .balign 32 - .rept 7 - .if vector < FIRST_SYSTEM_VECTOR - .if vector <> FIRST_EXTERNAL_VECTOR + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt CFI_ADJUST_CFA_OFFSET -4 - .endif -1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */ - .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 - jmp 2f - .endif - .previous - .long 1b - .section .entry.text, "ax" -vector=vector+1 - .endif - .endr -2: jmp common_interrupt -.endr + .align 8 + .endr END(irq_entries_start) -.previous -END(interrupt) -.previous - /* * the CPU automatically disables interrupts when executing an IRQ vector, * so IRQ-flags tracing has to follow that: @@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error) pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl_cfi $do_general_protection -662: -.section .altinstructions,"a" - altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f -.previous -.section .altinstr_replacement,"ax" -663: pushl $do_simd_coprocessor_error -664: -.previous + ALTERNATIVE "pushl_cfi $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM #else pushl_cfi $do_simd_coprocessor_error #endif @@ -1240,20 +1216,13 @@ error_code: /*CFI_REL_OFFSET es, 0*/ pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 2babb39..c7b2384 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -14,27 +14,14 @@ * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. * - * Normal syscalls and interrupts don't save a full stack frame, this is - * only done for syscall tracing, signals or fork/exec et.al. - * * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP + * - iret frame: Architecture defined interrupt frame from SS to RIP * at the top of the kernel process stack. - * - partial stack frame: partially saved registers up to R11. - * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better * backtraces. They don't change any code. - * - SAVE_ALL/RESTORE_ALL - Save/restore all registers - * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. - * There are unfortunately lots of special cases where some registers - * not touched. The macro is a big mess that should be cleaned up. - * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. - * Gives a full stack frame. * - ENTRY/END Define functions in the symbol table. - * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack - * frame that is otherwise undefined after a SYSCALL * - TRACE_IRQ_* - Trace hard interrupt state for lock debugging. * - idtentry - Define exception entry points. */ @@ -70,10 +57,6 @@ .section .entry.text, "ax" -#ifndef CONFIG_PREEMPT -#define retint_kernel retint_restore_args -#endif - #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) swapgs @@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +.macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON 1: @@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64) call debug_stack_reset .endm -.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ +.macro TRACE_IRQS_IRETQ_DEBUG + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON_DEBUG 1: @@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64) #endif /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based - * fast path FIXUP_TOP_OF_STACK is needed. - * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs - * manipulation. - */ - - /* %rsp:at FRAMEEND */ - .macro FIXUP_TOP_OF_STACK tmp offset=0 - movq PER_CPU_VAR(old_rsp),\tmp - movq \tmp,RSP+\offset(%rsp) - movq $__USER_DS,SS+\offset(%rsp) - movq $__USER_CS,CS+\offset(%rsp) - movq RIP+\offset(%rsp),\tmp /* get rip */ - movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ - movq R11+\offset(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS+\offset(%rsp) - .endm - - .macro RESTORE_TOP_OF_STACK tmp offset=0 - movq RSP+\offset(%rsp),\tmp - movq \tmp,PER_CPU_VAR(old_rsp) - movq EFLAGS+\offset(%rsp),\tmp - movq \tmp,R11+\offset(%rsp) - .endm - -/* - * initial frame state for interrupts (and exceptions without error code) + * empty frame */ .macro EMPTY_FRAME start=1 offset=0 .if \start @@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64) * initial frame state for interrupts (and exceptions without error code) */ .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, SS+8+\offset-RIP - /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ - CFI_REL_OFFSET rsp, RSP+\offset-RIP - /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ - /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ - CFI_REL_OFFSET rip, RIP+\offset-RIP + EMPTY_FRAME \start, 5*8+\offset + /*CFI_REL_OFFSET ss, 4*8+\offset*/ + CFI_REL_OFFSET rsp, 3*8+\offset + /*CFI_REL_OFFSET rflags, 2*8+\offset*/ + /*CFI_REL_OFFSET cs, 1*8+\offset*/ + CFI_REL_OFFSET rip, 0*8+\offset .endm /* @@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64) * with vector already pushed) */ .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, RIP+\offset-ORIG_RAX - .endm - -/* - * frame that enables calling into C. - */ - .macro PARTIAL_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET - CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET - CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET - CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET - CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET - CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET - CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET - CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET - CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET - CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + INTR_FRAME \start, 1*8+\offset .endm /* * frame that enables passing a complete pt_regs to a C function. */ .macro DEFAULT_FRAME start=1 offset=0 - PARTIAL_FRAME \start, R11+\offset-R15 + XCPT_FRAME \start, ORIG_RAX+\offset + CFI_REL_OFFSET rdi, RDI+\offset + CFI_REL_OFFSET rsi, RSI+\offset + CFI_REL_OFFSET rdx, RDX+\offset + CFI_REL_OFFSET rcx, RCX+\offset + CFI_REL_OFFSET rax, RAX+\offset + CFI_REL_OFFSET r8, R8+\offset + CFI_REL_OFFSET r9, R9+\offset + CFI_REL_OFFSET r10, R10+\offset + CFI_REL_OFFSET r11, R11+\offset CFI_REL_OFFSET rbx, RBX+\offset CFI_REL_OFFSET rbp, RBP+\offset CFI_REL_OFFSET r12, R12+\offset @@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64) CFI_REL_OFFSET r15, R15+\offset .endm -ENTRY(save_paranoid) - XCPT_FRAME 1 RDI+8 - cld - movq %rdi, RDI+8(%rsp) - movq %rsi, RSI+8(%rsp) - movq_cfi rdx, RDX+8 - movq_cfi rcx, RCX+8 - movq_cfi rax, RAX+8 - movq %r8, R8+8(%rsp) - movq %r9, R9+8(%rsp) - movq %r10, R10+8(%rsp) - movq %r11, R11+8(%rsp) - movq_cfi rbx, RBX+8 - movq %rbp, RBP+8(%rsp) - movq %r12, R12+8(%rsp) - movq %r13, R13+8(%rsp) - movq %r14, R14+8(%rsp) - movq %r15, R15+8(%rsp) - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx,%ebx -1: ret - CFI_ENDPROC -END(save_paranoid) - /* - * A newly forked process directly context switches into this address. + * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * rdi: prev task we switched from - */ -ENTRY(ret_from_fork) - DEFAULT_FRAME - - LOCK ; btr $TIF_FORK,TI_flags(%r8) - - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags - - call schedule_tail # rdi: 'prev' task parameter - - GET_THREAD_INFO(%rcx) - - RESTORE_REST - - testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? - jz 1f - - /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. - * Use int_ret_from_sys_call to return, since it can safely handle - * all of the above. - */ - jmp int_ret_from_sys_call - -1: - subq $REST_SKIP, %rsp # leave space for volatiles - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC -END(ret_from_fork) - -/* - * System call entry. Up to 6 arguments in registers are supported. + * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. * - * SYSCALL does not save anything on the stack and does not change the - * stack pointer. However, it does mask the flags register for us, so - * CLD and CLAC are not needed. - */ - -/* - * Register setup: + * Registers on entry: * rax system call number + * rcx return address + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) * rdi arg0 - * rcx return address for syscall/sysret, C arg3 * rsi arg1 * rdx arg2 - * r10 arg3 (--> moved to rcx for C) + * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. + * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) * - * Interrupts are off on entry. * Only called from user space. * - * XXX if we had a free scratch register we could save the RSP into the stack frame - * and report it properly in ps. Unfortunately we haven't. - * - * When user can change the frames always force IRET. That is because + * When user can change pt_regs->foo always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. */ @@ -324,9 +198,15 @@ END(ret_from_fork) ENTRY(system_call) CFI_STARTPROC simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET + CFI_DEF_CFA rsp,0 CFI_REGISTER rip,rcx /*CFI_REGISTER rflags,r11*/ + + /* + * Interrupts are off on entry. + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ SWAPGS_UNSAFE_STACK /* * A hypervisor implementation might want to use a label @@ -335,18 +215,38 @@ ENTRY(system_call) */ GLOBAL(system_call_after_swapgs) - movq %rsp,PER_CPU_VAR(old_rsp) + movq %rsp,PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack),%rsp + + /* Construct struct pt_regs on stack */ + pushq_cfi $__USER_DS /* pt_regs->ss */ + pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ /* - * No need to follow this irqs off/on section - it's straight - * and short: + * Re-enable interrupts. + * We use 'rsp_scratch' as a scratch space, hence irq-off block above + * must execute atomically in the face of possible interrupt-driven + * task preemption. We must enable interrupts only after we're done + * with using rsp_scratch: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8, 0, rax_enosys=1 - movq_cfi rax,(ORIG_RAX-ARGOFFSET) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + pushq_cfi %r11 /* pt_regs->flags */ + pushq_cfi $__USER_CS /* pt_regs->cs */ + pushq_cfi %rcx /* pt_regs->ip */ + CFI_REL_OFFSET rip,0 + pushq_cfi_reg rax /* pt_regs->orig_ax */ + pushq_cfi_reg rdi /* pt_regs->di */ + pushq_cfi_reg rsi /* pt_regs->si */ + pushq_cfi_reg rdx /* pt_regs->dx */ + pushq_cfi_reg rcx /* pt_regs->cx */ + pushq_cfi $-ENOSYS /* pt_regs->ax */ + pushq_cfi_reg r8 /* pt_regs->r8 */ + pushq_cfi_reg r9 /* pt_regs->r9 */ + pushq_cfi_reg r10 /* pt_regs->r10 */ + pushq_cfi_reg r11 /* pt_regs->r11 */ + sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */ + CFI_ADJUST_CFA_OFFSET 6*8 + + testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) jnz tracesys system_call_fastpath: #if __SYSCALL_MASK == ~0 @@ -355,18 +255,21 @@ system_call_fastpath: andl $__SYSCALL_MASK,%eax cmpl $__NR_syscall_max,%eax #endif - ja ret_from_sys_call /* and return regs->ax */ + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10,%rcx - call *sys_call_table(,%rax,8) # XXX: rip relative - movq %rax,RAX-ARGOFFSET(%rsp) + call *sys_call_table(,%rax,8) + movq %rax,RAX(%rsp) +1: /* - * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. + * Syscall return path ending with SYSRET (fast path). + * Has incompletely filled pt_regs. */ -ret_from_sys_call: LOCKDEP_SYS_EXIT + /* + * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, + * it is too small to ever cause noticeable irq latency. + */ DISABLE_INTERRUPTS(CLBR_NONE) - TRACE_IRQS_OFF /* * We must check ti flags with interrupts (or at least preemption) @@ -376,72 +279,73 @@ ret_from_sys_call: * flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is * very bad. */ - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) - jnz int_ret_from_sys_call_fixup /* Go the the slow path */ + testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS) + jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */ CFI_REMEMBER_STATE - /* - * sysretq will re-enable interrupts: - */ - TRACE_IRQS_ON - movq RIP-ARGOFFSET(%rsp),%rcx + + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RIP(%rsp),%rcx CFI_REGISTER rip,rcx - RESTORE_ARGS 1,-ARG_SKIP,0 + movq EFLAGS(%rsp),%r11 /*CFI_REGISTER rflags,r11*/ - movq PER_CPU_VAR(old_rsp), %rsp + movq RSP(%rsp),%rsp + /* + * 64bit SYSRET restores rip from rcx, + * rflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * Restoration of rflags re-enables interrupts. + */ USERGS_SYSRET64 CFI_RESTORE_STATE -int_ret_from_sys_call_fixup: - FIXUP_TOP_OF_STACK %r11, -ARGOFFSET - jmp int_ret_from_sys_call_irqs_off - - /* Do syscall tracing */ + /* Do syscall entry tracing */ tracesys: - leaq -REST_SKIP(%rsp), %rdi - movq $AUDIT_ARCH_X86_64, %rsi + movq %rsp, %rdi + movl $AUDIT_ARCH_X86_64, %esi call syscall_trace_enter_phase1 test %rax, %rax jnz tracesys_phase2 /* if needed, run the slow path */ - LOAD_ARGS 0 /* else restore clobbered regs */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax jmp system_call_fastpath /* and return to the fast path */ tracesys_phase2: - SAVE_REST - FIXUP_TOP_OF_STACK %rdi + SAVE_EXTRA_REGS movq %rsp, %rdi - movq $AUDIT_ARCH_X86_64, %rsi + movl $AUDIT_ARCH_X86_64, %esi movq %rax,%rdx call syscall_trace_enter_phase2 /* - * Reload arg registers from stack in case ptrace changed them. + * Reload registers from stack in case ptrace changed them. * We don't reload %rax because syscall_trace_entry_phase2() returned * the value it wants us to use in the table lookup. */ - LOAD_ARGS ARGOFFSET, 1 - RESTORE_REST + RESTORE_C_REGS_EXCEPT_RAX + RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax #else andl $__SYSCALL_MASK,%eax cmpl $__NR_syscall_max,%eax #endif - ja int_ret_from_sys_call /* RAX(%rsp) is already set */ + ja 1f /* return -ENOSYS (already in pt_regs->ax) */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - /* Use IRET because user could have changed frame */ + movq %rax,RAX(%rsp) +1: + /* Use IRET because user could have changed pt_regs->foo */ /* * Syscall return path ending with IRET. - * Has correct top of stack, but partial stack frame. + * Has correct iret frame. */ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) +int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */ TRACE_IRQS_OFF -int_ret_from_sys_call_irqs_off: movl $_TIF_ALLWORK_MASK,%edi /* edi: mask to check */ GLOBAL(int_with_check) @@ -450,8 +354,8 @@ GLOBAL(int_with_check) movl TI_flags(%rcx),%edx andl %edi,%edx jnz int_careful - andl $~TS_COMPAT,TI_status(%rcx) - jmp retint_swapgs + andl $~TS_COMPAT,TI_status(%rcx) + jmp syscall_return /* Either reschedule or signal or syscall exit tracking needed. */ /* First do a reschedule test. */ @@ -468,12 +372,11 @@ int_careful: TRACE_IRQS_OFF jmp int_with_check - /* handle signals and tracing -- both require a full stack frame */ + /* handle signals and tracing -- both require a full pt_regs */ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) -int_check_syscall_exit_work: - SAVE_REST + SAVE_EXTRA_REGS /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal @@ -492,86 +395,192 @@ int_signal: call do_notify_resume 1: movl $_TIF_WORK_MASK,%edi int_restore_rest: - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check + +syscall_return: + /* The IRETQ could re-enable interrupts: */ + DISABLE_INTERRUPTS(CLBR_ANY) + TRACE_IRQS_IRETQ + + /* + * Try to use SYSRET instead of IRET if we're returning to + * a completely clean 64-bit userspace context. + */ + movq RCX(%rsp),%rcx + cmpq %rcx,RIP(%rsp) /* RCX == RIP */ + jne opportunistic_sysret_failed + + /* + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP + * in kernel space. This essentially lets the user take over + * the kernel, since userspace controls RSP. It's not worth + * testing for canonicalness exactly -- this check detects any + * of the 17 high bits set, which is true for non-canonical + * or kernel addresses. (This will pessimize vsyscall=native. + * Big deal.) + * + * If virtual addresses ever become wider, this will need + * to be updated to remain correct on both old and new CPUs. + */ + .ifne __VIRTUAL_MASK_SHIFT - 47 + .error "virtual address width changed -- SYSRET checks need update" + .endif + shr $__VIRTUAL_MASK_SHIFT, %rcx + jnz opportunistic_sysret_failed + + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ + jne opportunistic_sysret_failed + + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ + jne opportunistic_sysret_failed + + /* + * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET, + * restoring TF results in a trap from userspace immediately after + * SYSRET. This would cause an infinite loop whenever #DB happens + * with register state that satisfies the opportunistic SYSRET + * conditions. For example, single-stepping this user code: + * + * movq $stuck_here,%rcx + * pushfq + * popq %r11 + * stuck_here: + * + * would never get past 'stuck_here'. + */ + testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 + jnz opportunistic_sysret_failed + + /* nothing to check for RSP */ + + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ + jne opportunistic_sysret_failed + + /* + * We win! This label is here just for ease of understanding + * perf profiles. Nothing jumps here. + */ +syscall_return_via_sysret: + CFI_REMEMBER_STATE + /* r11 is already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_R11 + movq RSP(%rsp),%rsp + USERGS_SYSRET64 + CFI_RESTORE_STATE + +opportunistic_sysret_failed: + SWAPGS + jmp restore_c_regs_and_iret CFI_ENDPROC END(system_call) + .macro FORK_LIKE func ENTRY(stub_\func) CFI_STARTPROC - popq %r11 /* save return address */ - PARTIAL_FRAME 0 - SAVE_REST - pushq %r11 /* put it back on stack */ - FIXUP_TOP_OF_STACK %r11, 8 - DEFAULT_FRAME 0 8 /* offset 8: return address */ - call sys_\func - RESTORE_TOP_OF_STACK %r11, 8 - ret $REST_SKIP /* pop extended registers */ + DEFAULT_FRAME 0, 8 /* offset 8: return address */ + SAVE_EXTRA_REGS 8 + jmp sys_\func CFI_ENDPROC END(stub_\func) .endm - .macro FIXED_FRAME label,func -ENTRY(\label) - CFI_STARTPROC - PARTIAL_FRAME 0 8 /* offset 8: return address */ - FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET - call \func - RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET - ret - CFI_ENDPROC -END(\label) - .endm - FORK_LIKE clone FORK_LIKE fork FORK_LIKE vfork - FIXED_FRAME stub_iopl, sys_iopl ENTRY(stub_execve) CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call sys_execve - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call + DEFAULT_FRAME 0, 8 + call sys_execve +return_from_execve: + testl %eax, %eax + jz 1f + /* exec failed, can use fast SYSRET code path in this case */ + ret +1: + /* must use IRET code path (pt_regs->cs may have changed) */ + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 + ZERO_EXTRA_REGS + movq %rax,RAX(%rsp) + jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) - -ENTRY(stub_execveat) +/* + * Remaining execve stubs are only 7 bytes long. + * ENTRY() often aligns to 16 bytes, which in this case has no benefits. + */ + .align 8 +GLOBAL(stub_execveat) CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call sys_execveat - RESTORE_TOP_OF_STACK %r11 - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call + DEFAULT_FRAME 0, 8 + call sys_execveat + jmp return_from_execve CFI_ENDPROC END(stub_execveat) +#ifdef CONFIG_X86_X32_ABI + .align 8 +GLOBAL(stub_x32_execve) + CFI_STARTPROC + DEFAULT_FRAME 0, 8 + call compat_sys_execve + jmp return_from_execve + CFI_ENDPROC +END(stub_x32_execve) + .align 8 +GLOBAL(stub_x32_execveat) + CFI_STARTPROC + DEFAULT_FRAME 0, 8 + call compat_sys_execveat + jmp return_from_execve + CFI_ENDPROC +END(stub_x32_execveat) +#endif + +#ifdef CONFIG_IA32_EMULATION + .align 8 +GLOBAL(stub32_execve) + CFI_STARTPROC + call compat_sys_execve + jmp return_from_execve + CFI_ENDPROC +END(stub32_execve) + .align 8 +GLOBAL(stub32_execveat) + CFI_STARTPROC + call compat_sys_execveat + jmp return_from_execve + CFI_ENDPROC +END(stub32_execveat) +#endif + /* * sigreturn is special because it needs to restore all registers on return. * This cannot be done with SYSRET, so use the IRET return path instead. */ ENTRY(stub_rt_sigreturn) CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 + DEFAULT_FRAME 0, 8 + /* + * SAVE_EXTRA_REGS result is not normally needed: + * sigreturn overwrites all pt_regs->GPREGS. + * But sigreturn can fail (!), and there is no easy way to detect that. + * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error, + * we SAVE_EXTRA_REGS here. + */ + SAVE_EXTRA_REGS 8 call sys_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST +return_from_stub: + addq $8, %rsp + CFI_ADJUST_CFA_OFFSET -8 + RESTORE_EXTRA_REGS + movq %rax,RAX(%rsp) jmp int_ret_from_sys_call CFI_ENDPROC END(stub_rt_sigreturn) @@ -579,86 +588,70 @@ END(stub_rt_sigreturn) #ifdef CONFIG_X86_X32_ABI ENTRY(stub_x32_rt_sigreturn) CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 + DEFAULT_FRAME 0, 8 + SAVE_EXTRA_REGS 8 call sys32_x32_rt_sigreturn - movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST - jmp int_ret_from_sys_call + jmp return_from_stub CFI_ENDPROC END(stub_x32_rt_sigreturn) +#endif -ENTRY(stub_x32_execve) - CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call compat_sys_execve - RESTORE_TOP_OF_STACK %r11 - movq %rax,RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC -END(stub_x32_execve) +/* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + DEFAULT_FRAME -ENTRY(stub_x32_execveat) - CFI_STARTPROC - addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST - FIXUP_TOP_OF_STACK %r11 - call compat_sys_execveat - RESTORE_TOP_OF_STACK %r11 - movq %rax,RAX(%rsp) - RESTORE_REST + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq_cfi $0x0002 + popfq_cfi # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + RESTORE_EXTRA_REGS + + testl $3,CS(%rsp) # from kernel_thread? + + /* + * By the time we get here, we have no idea whether our pt_regs, + * ti flags, and ti status came from the 64-bit SYSCALL fast path, + * the slow path, or one of the ia32entry paths. + * Use IRET code path to return, since it can safely handle + * all of the above. + */ + jnz int_ret_from_sys_call + + /* We came from kernel_thread */ + /* nb: we depend on RESTORE_EXTRA_REGS above */ + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC -END(stub_x32_execveat) - -#endif +END(ret_from_fork) /* - * Build the entry stubs and pointer table with some assembler magic. - * We pack 7 stubs into a single 32-byte chunk, which will fit in a - * single cache line on all modern x86 implementations. + * Build the entry stubs with some assembler magic. + * We pack 1 stub into every 8-byte block. */ - .section .init.rodata,"a" -ENTRY(interrupt) - .section .entry.text - .p2align 5 - .p2align CONFIG_X86_L1_CACHE_SHIFT + .align 8 ENTRY(irq_entries_start) INTR_FRAME -vector=FIRST_EXTERNAL_VECTOR -.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7 - .balign 32 - .rept 7 - .if vector < FIRST_SYSTEM_VECTOR - .if vector <> FIRST_EXTERNAL_VECTOR + vector=FIRST_EXTERNAL_VECTOR + .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ + vector=vector+1 + jmp common_interrupt CFI_ADJUST_CFA_OFFSET -8 - .endif -1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */ - .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6 - jmp 2f - .endif - .previous - .quad 1b - .section .entry.text -vector=vector+1 - .endif - .endr -2: jmp common_interrupt -.endr + .align 8 + .endr CFI_ENDPROC END(irq_entries_start) -.previous -END(interrupt) -.previous - /* * Interrupt entry/exit. * @@ -669,47 +662,45 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - /* reserve pt_regs for scratch regs and rbp */ - subq $ORIG_RAX-RBP, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP cld - /* start from rbp in pt_regs and jump over */ - movq_cfi rdi, (RDI-RBP) - movq_cfi rsi, (RSI-RBP) - movq_cfi rdx, (RDX-RBP) - movq_cfi rcx, (RCX-RBP) - movq_cfi rax, (RAX-RBP) - movq_cfi r8, (R8-RBP) - movq_cfi r9, (R9-RBP) - movq_cfi r10, (R10-RBP) - movq_cfi r11, (R11-RBP) - - /* Save rbp so that we can unwind from get_irq_regs() */ - movq_cfi rbp, 0 - - /* Save previous stack value */ - movq %rsp, %rsi + /* + * Since nothing in interrupt handling code touches r12...r15 members + * of "struct pt_regs", and since interrupts can nest, we can save + * four stack slots and simultaneously provide + * an unwind-friendly stack layout by saving "truncated" pt_regs + * exactly up to rbp slot, without these members. + */ + ALLOC_PT_GPREGS_ON_STACK -RBP + SAVE_C_REGS -RBP + /* this goes to 0(%rsp) for unwinder, not for saving the value: */ + SAVE_EXTRA_REGS_RBP -RBP + + leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS-RBP(%rsi) + testl $3, CS-RBP(%rsp) je 1f SWAPGS +1: /* + * Save previous stack pointer, optionally switch to interrupt stack. * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ -1: incl PER_CPU_VAR(irq_count) + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi - - /* Store previous stack value */ pushq %rsi + /* + * For debugger: + * "CFA (Current Frame Address) is the value on stack + offset" + */ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 */, 0, \ + 0x77 /* DW_OP_breg7 (rsp) */, 0, \ 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -727,7 +718,7 @@ common_interrupt: ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ - /* 0(%rsp): old_rsp-ARGOFFSET */ + /* 0(%rsp): old RSP */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -735,19 +726,18 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ - leaq ARGOFFSET-RBP(%rsi), %rsp + CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ + /* return code expects complete pt_regs - adjust rsp accordingly: */ + leaq -RBP(%rsi),%rsp CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET + CFI_ADJUST_CFA_OFFSET RBP -exit_intr: - GET_THREAD_INFO(%rcx) - testl $3,CS-ARGOFFSET(%rsp) + testl $3,CS(%rsp) je retint_kernel - /* Interrupt came from user space */ + + GET_THREAD_INFO(%rcx) /* - * Has a correct top of stack, but a partial stack frame * %rcx: thread info. Interrupts off. */ retint_with_reschedule: @@ -766,70 +756,34 @@ retint_swapgs: /* return to user-space */ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_IRETQ - /* - * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. - */ - movq (RCX-R11)(%rsp), %rcx - cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ - jne opportunistic_sysret_failed - - /* - * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP - * in kernel space. This essentially lets the user take over - * the kernel, since userspace controls RSP. It's not worth - * testing for canonicalness exactly -- this check detects any - * of the 17 high bits set, which is true for non-canonical - * or kernel addresses. (This will pessimize vsyscall=native. - * Big deal.) - * - * If virtual addresses ever become wider, this will need - * to be updated to remain correct on both old and new CPUs. - */ - .ifne __VIRTUAL_MASK_SHIFT - 47 - .error "virtual address width changed -- sysret checks need update" - .endif - shr $__VIRTUAL_MASK_SHIFT, %rcx - jnz opportunistic_sysret_failed - - cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed - - movq (R11-ARGOFFSET)(%rsp), %r11 - cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed - - testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */ - jnz opportunistic_sysret_failed - - /* nothing to check for RSP */ - - cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed - - /* - * We win! This label is here just for ease of understanding - * perf profiles. Nothing jumps here. - */ -irq_return_via_sysret: - CFI_REMEMBER_STATE - RESTORE_ARGS 1,8,1 - movq (RSP-RIP)(%rsp),%rsp - USERGS_SYSRET64 - CFI_RESTORE_STATE - -opportunistic_sysret_failed: SWAPGS - jmp restore_args + jmp restore_c_regs_and_iret -retint_restore_args: /* return to kernel space */ - DISABLE_INTERRUPTS(CLBR_ANY) +/* Returning to kernel space */ +retint_kernel: +#ifdef CONFIG_PREEMPT + /* Interrupts are off */ + /* Check if we need preemption */ + bt $9,EFLAGS(%rsp) /* interrupts were off? */ + jnc 1f +0: cmpl $0,PER_CPU_VAR(__preempt_count) + jnz 1f + call preempt_schedule_irq + jmp 0b +1: +#endif /* * The iretq could re-enable interrupts: */ TRACE_IRQS_IRETQ -restore_args: - RESTORE_ARGS 1,8,1 + +/* + * At this label, code paths which return to kernel and to user, + * which come from interrupts/exception and from syscalls, merge. + */ +restore_c_regs_and_iret: + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 irq_return: INTERRUPT_RETURN @@ -900,28 +854,17 @@ retint_signal: jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_REST + SAVE_EXTRA_REGS movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) jmp retint_with_reschedule -#ifdef CONFIG_PREEMPT - /* Returning to kernel space. Check if we need preemption */ - /* rcx: threadinfo. interrupts off. */ -ENTRY(retint_kernel) - cmpl $0,PER_CPU_VAR(__preempt_count) - jnz retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ - jnc retint_restore_args - call preempt_schedule_irq - jmp exit_intr -#endif CFI_ENDPROC END(common_interrupt) @@ -1010,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) @@ -1032,8 +975,7 @@ ENTRY(\sym) pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ .endif - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 @@ -1041,10 +983,11 @@ ENTRY(\sym) testl $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif - call save_paranoid + call paranoid_entry .else call error_entry .endif + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ DEFAULT_FRAME 0 @@ -1066,19 +1009,20 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif + /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit /* %ebx: no swapgs flag */ + jmp paranoid_exit .else - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit .endif .if \paranoid == 1 @@ -1282,7 +1226,9 @@ ENTRY(xen_failsafe_callback) addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS jmp error_exit CFI_ENDPROC END(xen_failsafe_callback) @@ -1314,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif - /* - * "Paranoid" exit path from exception stack. This is invoked - * only on return from non-NMI IST interrupts that came - * from kernel space. - * - * We may be returning to very strange contexts (e.g. very early - * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. - */ +/* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(paranoid_entry) + XCPT_FRAME 1 15*8 + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(paranoid_entry) - /* ebx: no swapgs flag */ +/* + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. + * + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. + */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - TRACE_IRQS_IRETQ 0 + jnz paranoid_exit_no_swapgs + TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - RESTORE_ALL 8 - INTERRUPT_RETURN -paranoid_restore: - TRACE_IRQS_IRETQ_DEBUG 0 - RESTORE_ALL 8 + jmp paranoid_exit_restore +paranoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG +paranoid_exit_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN CFI_ENDPROC END(paranoid_exit) /* - * Exception entry point. This expects an error code/orig_rax on the stack. - * returns in "no swapgs flag" in %ebx. + * Save all registers in pt_regs, and switch gs if needed. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME - CFI_ADJUST_CFA_OFFSET 15*8 - /* oldrax contains error code */ + XCPT_FRAME 1 15*8 cld - movq %rdi, RDI+8(%rsp) - movq %rsi, RSI+8(%rsp) - movq %rdx, RDX+8(%rsp) - movq %rcx, RCX+8(%rsp) - movq %rax, RAX+8(%rsp) - movq %r8, R8+8(%rsp) - movq %r9, R9+8(%rsp) - movq %r10, R10+8(%rsp) - movq %r11, R11+8(%rsp) - movq_cfi rbx, RBX+8 - movq %rbp, RBP+8(%rsp) - movq %r12, R12+8(%rsp) - movq %r13, R13+8(%rsp) - movq %r14, R14+8(%rsp) - movq %r15, R15+8(%rsp) + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1376,12 +1329,12 @@ error_sti: TRACE_IRQS_OFF ret -/* - * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. B stepping K8s sometimes report a - * truncated RIP for IRET exceptions returning to compat mode. Check - * for these here too. - */ + /* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. + */ error_kernelspace: CFI_REL_OFFSET rcx, RCX+8 incl %ebx @@ -1411,11 +1364,11 @@ error_bad_iret: END(error_entry) -/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) DEFAULT_FRAME movl %ebx,%eax - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) @@ -1430,19 +1383,7 @@ ENTRY(error_exit) CFI_ENDPROC END(error_exit) -/* - * Test if a given stack is an NMI stack or not. - */ - .macro test_in_nmi reg stack nmi_ret normal_ret - cmpq %\reg, \stack - ja \normal_ret - subq $EXCEPTION_STKSZ, %\reg - cmpq %\reg, \stack - jb \normal_ret - jmp \nmi_ret - .endm - - /* runs on exception stack */ +/* Runs on exception stack */ ENTRY(nmi) INTR_FRAME PARAVIRT_ADJUST_EXCEPTION_FRAME @@ -1478,7 +1419,7 @@ ENTRY(nmi) * NMI. */ - /* Use %rdx as out temp variable throughout */ + /* Use %rdx as our temp variable throughout */ pushq_cfi %rdx CFI_REL_OFFSET rdx, 0 @@ -1503,8 +1444,17 @@ ENTRY(nmi) * We check the variable because the first NMI could be in a * breakpoint routine using a breakpoint stack. */ - lea 6*8(%rsp), %rdx - test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi + lea 6*8(%rsp), %rdx + /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */ + cmpq %rdx, 4*8(%rsp) + /* If the stack pointer is above the NMI stack, this is a normal NMI */ + ja first_nmi + subq $EXCEPTION_STKSZ, %rdx + cmpq %rdx, 4*8(%rsp) + /* If it is below the NMI stack, it is a normal NMI */ + jb first_nmi + /* Ah, it is within the NMI stack, treat it as nested */ + CFI_REMEMBER_STATE nested_nmi: @@ -1597,7 +1547,7 @@ first_nmi: .rept 5 pushq_cfi 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET SS+8-RIP + CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1625,7 +1575,7 @@ repeat_nmi: pushq_cfi -6*8(%rsp) .endr subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET SS+8-RIP + CFI_DEF_CFA_OFFSET 5*8 end_repeat_nmi: /* @@ -1634,16 +1584,16 @@ end_repeat_nmi: * so that we repeat another NMI. */ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + ALLOC_PT_GPREGS_ON_STACK + /* - * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit * as we should not be calling schedule in NMI context. * Even with normal interrupts enabled. An NMI should not be * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call save_paranoid + call paranoid_entry DEFAULT_FRAME 0 /* @@ -1674,8 +1624,10 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS /* Pop the extra iret frame at once */ - RESTORE_ALL 6*8 + REMOVE_PT_GPREGS_FROM_STACK 6*8 /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c index c4f8d46..2b55ee6 100644 --- a/arch/x86/kernel/head64.c +++ b/arch/x86/kernel/head64.c @@ -177,9 +177,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data) */ load_ucode_bsp(); - if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG) - early_printk("Kernel alive\n"); - clear_page(init_level4_pgt); /* set init_level4_pgt kernel high mapping*/ init_level4_pgt[511] = early_level4_pgt[511]; diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f36bd42..d031bad 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -22,6 +22,7 @@ #include <asm/cpufeature.h> #include <asm/percpu.h> #include <asm/nops.h> +#include <asm/bootparam.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -90,7 +91,7 @@ ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 2f /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6fd514d9..ae6588b 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -1,5 +1,5 @@ /* - * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit * * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> @@ -56,7 +56,7 @@ startup_64: * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from - * arch/x86_64/boot/compressed/head.S. + * arch/x86/boot/compressed/head_64.S. * * We only come here initially at boot nothing else comes here. * @@ -146,7 +146,7 @@ startup_64: leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ -1: testq $1, 0(%rdi) +1: testb $1, 0(%rdi) jz 2f addq %rbp, 0(%rdi) /* Go to the next page */ diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c index d5651fc..367f39d 100644 --- a/arch/x86/kernel/i387.c +++ b/arch/x86/kernel/i387.c @@ -42,8 +42,8 @@ void kernel_fpu_enable(void) * be set (so that the clts/stts pair does nothing that is * visible in the interrupted kernel thread). * - * Except for the eagerfpu case when we return 1 unless we've already - * been eager and saved the state in kernel_fpu_begin(). + * Except for the eagerfpu case when we return true; in the likely case + * the thread has FPU but we are not going to set/clear TS. */ static inline bool interrupted_kernel_fpu_idle(void) { @@ -51,7 +51,7 @@ static inline bool interrupted_kernel_fpu_idle(void) return false; if (use_eager_fpu()) - return __thread_has_fpu(current); + return true; return !__thread_has_fpu(current) && (read_cr0() & X86_CR0_TS); @@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void) static inline bool interrupted_user_mode(void) { struct pt_regs *regs = get_irq_regs(); - return regs && user_mode_vm(regs); + return regs && user_mode(regs); } /* @@ -94,9 +94,10 @@ void __kernel_fpu_begin(void) if (__thread_has_fpu(me)) { __save_init_fpu(me); - } else if (!use_eager_fpu()) { + } else { this_cpu_write(fpu_owner_task, NULL); - clts(); + if (!use_eager_fpu()) + clts(); } } EXPORT_SYMBOL(__kernel_fpu_begin); @@ -107,7 +108,7 @@ void __kernel_fpu_end(void) if (__thread_has_fpu(me)) { if (WARN_ON(restore_fpu_checking(me))) - drop_init_fpu(me); + fpu_reset_state(me); } else if (!use_eager_fpu()) { stts(); } @@ -120,10 +121,13 @@ void unlazy_fpu(struct task_struct *tsk) { preempt_disable(); if (__thread_has_fpu(tsk)) { - __save_init_fpu(tsk); - __thread_fpu_end(tsk); - } else - tsk->thread.fpu_counter = 0; + if (use_eager_fpu()) { + __save_fpu(tsk); + } else { + __save_init_fpu(tsk); + __thread_fpu_end(tsk); + } + } preempt_enable(); } EXPORT_SYMBOL(unlazy_fpu); @@ -221,11 +225,12 @@ void fpu_finit(struct fpu *fpu) return; } + memset(fpu->state, 0, xstate_size); + if (cpu_has_fxsr) { fx_finit(&fpu->state->fxsave); } else { struct i387_fsave_struct *fp = &fpu->state->fsave; - memset(fp, 0, xstate_size); fp->cwd = 0xffff037fu; fp->swd = 0xffff0000u; fp->twd = 0xffffffffu; @@ -247,7 +252,7 @@ int init_fpu(struct task_struct *tsk) if (tsk_used_math(tsk)) { if (cpu_has_fpu && tsk == current) unlazy_fpu(tsk); - tsk->thread.fpu.last_cpu = ~0; + task_disable_lazy_fpu_restore(tsk); return 0; } @@ -336,6 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, void *kbuf, void __user *ubuf) { + struct xsave_struct *xsave = &target->thread.fpu.state->xsave; int ret; if (!cpu_has_xsave) @@ -350,14 +356,12 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset, * memory layout in the thread struct, so that we can copy the entire * xstateregs to the user using one user_regset_copyout(). */ - memcpy(&target->thread.fpu.state->fxsave.sw_reserved, - xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); - + memcpy(&xsave->i387.sw_reserved, + xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes)); /* * Copy the xstate memory layout. */ - ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->xsave, 0, -1); + ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); return ret; } @@ -365,8 +369,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, unsigned int pos, unsigned int count, const void *kbuf, const void __user *ubuf) { + struct xsave_struct *xsave = &target->thread.fpu.state->xsave; int ret; - struct xsave_hdr_struct *xsave_hdr; if (!cpu_has_xsave) return -ENODEV; @@ -375,22 +379,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, if (ret) return ret; - ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, - &target->thread.fpu.state->xsave, 0, -1); - + ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); /* * mxcsr reserved bits must be masked to zero for security reasons. */ - target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask; - - xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr; - - xsave_hdr->xstate_bv &= pcntxt_mask; + xsave->i387.mxcsr &= mxcsr_feature_mask; + xsave->xsave_hdr.xstate_bv &= pcntxt_mask; /* * These bits must be zero. */ - memset(xsave_hdr->reserved, 0, 48); - + memset(&xsave->xsave_hdr.reserved, 0, 48); return ret; } diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 4ddaf66..37dae79 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 67b1cbe..e5952c2 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void) this_cpu = smp_processor_id(); cpumask_copy(&online_new, cpu_online_mask); - cpu_clear(this_cpu, online_new); + cpumask_clear_cpu(this_cpu, &online_new); this_count = 0; for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) { @@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void) data = irq_desc_get_irq_data(desc); cpumask_copy(&affinity_new, data->affinity); - cpu_clear(this_cpu, affinity_new); + cpumask_clear_cpu(this_cpu, &affinity_new); /* Do not count inactive or per-cpu irqs. */ if (!irq_has_action(irq) || irqd_is_per_cpu(data)) diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 28d28f5..f9fd86a 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs) if (unlikely(!desc)) return false; - if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) { + if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) { if (unlikely(overflow)) print_stack_overflow(); desc->handle_irq(irq, desc); diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index e4b503d..394e643 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs) u64 estack_top, estack_bottom; u64 curbase = (u64)task_stack_page(current); - if (user_mode_vm(regs)) + if (user_mode(regs)) return; if (regs->sp >= curbase + sizeof(struct thread_info) + diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c index 70e181e..cd10a64 100644 --- a/arch/x86/kernel/irqinit.c +++ b/arch/x86/kernel/irqinit.c @@ -178,7 +178,8 @@ void __init native_init_IRQ(void) #endif for_each_clear_bit_from(i, used_vectors, first_system_vector) { /* IA32_SYSCALL_VECTOR could be used in trap_init already. */ - set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); + set_intr_gate(i, irq_entries_start + + 8 * (i - FIRST_EXTERNAL_VECTOR)); } #ifdef CONFIG_X86_LOCAL_APIC for_each_clear_bit_from(i, used_vectors, NR_VECTORS) diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index 7ec1d5f..d6178d9 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -72,7 +72,7 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] = { "bx", 8, offsetof(struct pt_regs, bx) }, { "cx", 8, offsetof(struct pt_regs, cx) }, { "dx", 8, offsetof(struct pt_regs, dx) }, - { "si", 8, offsetof(struct pt_regs, dx) }, + { "si", 8, offsetof(struct pt_regs, si) }, { "di", 8, offsetof(struct pt_regs, di) }, { "bp", 8, offsetof(struct pt_regs, bp) }, { "sp", 8, offsetof(struct pt_regs, sp) }, @@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs) #ifdef CONFIG_X86_32 switch (regno) { case GDB_SS: - if (!user_mode_vm(regs)) + if (!user_mode(regs)) *(unsigned long *)mem = __KERNEL_DS; break; case GDB_SP: - if (!user_mode_vm(regs)) + if (!user_mode(regs)) *(unsigned long *)mem = kernel_stack_pointer(regs); break; case GDB_GS: diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 03189d8..1deffe6 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -605,7 +605,7 @@ int kprobe_int3_handler(struct pt_regs *regs) struct kprobe *p; struct kprobe_ctlblk *kcb; - if (user_mode_vm(regs)) + if (user_mode(regs)) return 0; addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t)); @@ -1010,7 +1010,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val, struct die_args *args = data; int ret = NOTIFY_DONE; - if (args->regs && user_mode_vm(args->regs)) + if (args->regs && user_mode(args->regs)) return ret; if (val == DIE_GPF) { diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index d1ac80b..005c03e 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -33,6 +33,7 @@ #include <asm/page.h> #include <asm/pgtable.h> +#include <asm/setup.h> #if 0 #define DEBUGP(fmt, ...) \ @@ -47,21 +48,13 @@ do { \ #ifdef CONFIG_RANDOMIZE_BASE static unsigned long module_load_offset; -static int randomize_modules = 1; /* Mutex protects the module_load_offset. */ static DEFINE_MUTEX(module_kaslr_mutex); -static int __init parse_nokaslr(char *p) -{ - randomize_modules = 0; - return 0; -} -early_param("nokaslr", parse_nokaslr); - static unsigned long int get_module_load_offset(void) { - if (randomize_modules) { + if (kaslr_enabled()) { mutex_lock(&module_kaslr_mutex); /* * Calculate the module_load_offset the first time this diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 781861c..da8cb98 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user, } /* - * RIP, flags, and the argument registers are usually saved. - * orig_ax is probably okay, too. + * These registers are always saved on 64-bit syscall entry. + * On 32-bit entry points, they are saved too except r8..r11. */ regs_user_copy->ip = user_regs->ip; + regs_user_copy->ax = user_regs->ax; regs_user_copy->cx = user_regs->cx; regs_user_copy->dx = user_regs->dx; regs_user_copy->si = user_regs->si; @@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user, regs_user_copy->r11 = user_regs->r11; regs_user_copy->orig_ax = user_regs->orig_ax; regs_user_copy->flags = user_regs->flags; + regs_user_copy->sp = user_regs->sp; + regs_user_copy->cs = user_regs->cs; + regs_user_copy->ss = user_regs->ss; /* - * Don't even try to report the "rest" regs. + * Most system calls don't save these registers, don't report them. */ regs_user_copy->bx = -1; regs_user_copy->bp = -1; @@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user, /* * For this to be at all useful, we need a reasonable guess for - * sp and the ABI. Be careful: we're in NMI context, and we're + * the ABI. Be careful: we're in NMI context, and we're * considering current to be the current task, so we should * be careful not to look at any other percpu variables that might * change during context switches. */ - if (IS_ENABLED(CONFIG_IA32_EMULATION) && - task_thread_info(current)->status & TS_COMPAT) { - /* Easy case: we're in a compat syscall. */ - regs_user->abi = PERF_SAMPLE_REGS_ABI_32; - regs_user_copy->sp = user_regs->sp; - regs_user_copy->cs = user_regs->cs; - regs_user_copy->ss = user_regs->ss; - } else if (user_regs->orig_ax != -1) { - /* - * We're probably in a 64-bit syscall. - * Warning: this code is severely racy. At least it's better - * than just blindly copying user_regs. - */ - regs_user->abi = PERF_SAMPLE_REGS_ABI_64; - regs_user_copy->sp = this_cpu_read(old_rsp); - regs_user_copy->cs = __USER_CS; - regs_user_copy->ss = __USER_DS; - regs_user_copy->cx = -1; /* usually contains garbage */ - } else { - /* We're probably in an interrupt or exception. */ - regs_user->abi = user_64bit_mode(user_regs) ? - PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; - regs_user_copy->sp = user_regs->sp; - regs_user_copy->cs = user_regs->cs; - regs_user_copy->ss = user_regs->ss; - } + regs_user->abi = user_64bit_mode(user_regs) ? + PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32; regs_user->regs = regs_user_copy; } diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 046e2d6..8213da6 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -9,7 +9,7 @@ #include <linux/sched.h> #include <linux/module.h> #include <linux/pm.h> -#include <linux/clockchips.h> +#include <linux/tick.h> #include <linux/random.h> #include <linux/user-return-notifier.h> #include <linux/dmi.h> @@ -24,6 +24,7 @@ #include <asm/syscalls.h> #include <asm/idle.h> #include <asm/uaccess.h> +#include <asm/mwait.h> #include <asm/i387.h> #include <asm/fpu-internal.h> #include <asm/debugreg.h> @@ -37,7 +38,26 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .x86_tss = { + .sp0 = TOP_OF_INIT_STACK, +#ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, +#endif + }, +#ifdef CONFIG_X86_32 + /* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, +#endif +}; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -69,8 +89,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src) dst->thread.fpu_counter = 0; dst->thread.fpu.has_fpu = 0; - dst->thread.fpu.last_cpu = ~0; dst->thread.fpu.state = NULL; + task_disable_lazy_fpu_restore(dst); if (tsk_used_math(src)) { int err = fpu_alloc(&dst->thread.fpu); if (err) @@ -109,7 +129,7 @@ void exit_thread(void) unsigned long *bp = t->io_bitmap_ptr; if (bp) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); @@ -131,13 +151,18 @@ void flush_thread(void) flush_ptrace_hw_breakpoint(tsk); memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array)); - drop_init_fpu(tsk); - /* - * Free the FPU state for non xsave platforms. They get reallocated - * lazily at the first use. - */ - if (!use_eager_fpu()) + + if (!use_eager_fpu()) { + /* FPU state will be reallocated lazily at the first use. */ + drop_fpu(tsk); free_thread_xstate(tsk); + } else if (!used_math()) { + /* kthread execs. TODO: cleanup this horror. */ + if (WARN_ON(init_fpu(tsk))) + force_sig(SIGKILL, tsk); + user_fpu_begin(); + restore_init_xstate(); + } } static void hard_disable_TSC(void) @@ -377,14 +402,11 @@ static void amd_e400_idle(void) if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) { cpumask_set_cpu(cpu, amd_e400_c1e_mask); - /* - * Force broadcast so ACPI can not interfere. - */ - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE, - &cpu); + /* Force broadcast so ACPI can not interfere. */ + tick_broadcast_force(); pr_info("Switch to broadcast mode on CPU%d\n", cpu); } - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu); + tick_broadcast_enter(); default_idle(); @@ -393,12 +415,59 @@ static void amd_e400_idle(void) * called with interrupts disabled. */ local_irq_disable(); - clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu); + tick_broadcast_exit(); local_irq_enable(); } else default_idle(); } +/* + * Intel Core2 and older machines prefer MWAIT over HALT for C1. + * We can't rely on cpuidle installing MWAIT, because it will not load + * on systems that support only C1 -- so the boot default must be MWAIT. + * + * Some AMD machines are the opposite, they depend on using HALT. + * + * So for default C1, which is used during boot until cpuidle loads, + * use MWAIT-C1 on Intel HW that has it, else use HALT. + */ +static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c) +{ + if (c->x86_vendor != X86_VENDOR_INTEL) + return 0; + + if (!cpu_has(c, X86_FEATURE_MWAIT)) + return 0; + + return 1; +} + +/* + * MONITOR/MWAIT with no hints, used for default default C1 state. + * This invokes MWAIT with interrutps enabled and no flags, + * which is backwards compatible with the original MWAIT implementation. + */ + +static void mwait_idle(void) +{ + if (!current_set_polling_and_test()) { + if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) { + smp_mb(); /* quirk */ + clflush((void *)¤t_thread_info()->flags); + smp_mb(); /* quirk */ + } + + __monitor((void *)¤t_thread_info()->flags, 0, 0); + if (!need_resched()) + __sti_mwait(0, 0); + else + local_irq_enable(); + } else { + local_irq_enable(); + } + __current_clr_polling(); +} + void select_idle_routine(const struct cpuinfo_x86 *c) { #ifdef CONFIG_SMP @@ -412,6 +481,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c) /* E400: APIC timer interrupt does not wake up CPU from C1e */ pr_info("using AMD E400 aware idle routine\n"); x86_idle = amd_e400_idle; + } else if (prefer_mwait_c1_over_halt(c)) { + pr_info("using mwait in idle threads\n"); + x86_idle = mwait_idle; } else x86_idle = default_idle; } diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f9..8ed2106 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all) unsigned long sp; unsigned short ss, gs; - if (user_mode_vm(regs)) { + if (user_mode(regs)) { sp = regs->sp; ss = regs->ss & 0xffff; gs = get_user_gs(regs); @@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->ip = new_ip; regs->sp = new_sp; regs->flags = X86_EFLAGS_IF; - /* - * force it to the iret return path by making it look as if there was - * some work pending. - */ - set_thread_flag(TIF_NOTIFY_RESUME); + force_iret(); } EXPORT_SYMBOL_GPL(start_thread); @@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); fpu_switch_t fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* - * Reload esp0. - */ - load_sp0(tss, next); - - /* * Save away %gs. No need to save %fs, as it was saved on the * stack on entry. No need to save %es and %ds, as those are * always kernel segments while inside the kernel. Doing this @@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); + /* + * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * current_thread_info(). + */ + load_sp0(tss, next); this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43..4baaa97 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -__visible DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, rsp_scratch); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; - p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); p->thread.io_bitmap_ptr = NULL; @@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) + if (is_ia32_task()) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); else @@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); - current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; + force_iret(); } void @@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned fsindex, gsindex; fpu_switch_t fpu; fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* Reload esp0 and ss1. */ - load_sp0(tss, next); - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->usersp = this_cpu_read(old_rsp); - this_cpu_write(old_rsp, next->usersp); this_cpu_write(current_task, next_p); /* @@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + /* Reload esp0 and ss1. This changes current_thread_info(). */ + load_sp0(tss, next); + this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + (unsigned long)task_stack_page(next_p) + THREAD_SIZE); /* * Now maybe reload the debug registers and handle I/O bitmaps @@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr) unsigned long KSTK_ESP(struct task_struct *task) { - return (test_tsk_thread_flag(task, TIF_IA32)) ? - (task_pt_regs(task)->sp) : ((task)->thread.usersp); + return task_pt_regs(task)->sp; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e510618..a7bc794 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task, case offsetof(struct user_regs_struct,cs): if (unlikely(value == 0)) return -EIO; -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - task_pt_regs(task)->cs = value; -#endif + task_pt_regs(task)->cs = value; break; case offsetof(struct user_regs_struct,ss): if (unlikely(value == 0)) return -EIO; -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - task_pt_regs(task)->ss = value; -#endif + task_pt_regs(task)->ss = value; break; } @@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk, memset(info, 0, sizeof(*info)); info->si_signo = SIGTRAP; info->si_code = si_code; - info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL; + info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL; } void user_single_step_siginfo(struct task_struct *tsk, diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c index 2f355d2..e5ecd20 100644 --- a/arch/x86/kernel/pvclock.c +++ b/arch/x86/kernel/pvclock.c @@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock, set_normalized_timespec(ts, now.tv_sec, now.tv_nsec); } +static struct pvclock_vsyscall_time_info *pvclock_vdso_info; + +static struct pvclock_vsyscall_time_info * +pvclock_get_vsyscall_user_time_info(int cpu) +{ + if (!pvclock_vdso_info) { + BUG(); + return NULL; + } + + return &pvclock_vdso_info[cpu]; +} + +struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu) +{ + return &pvclock_get_vsyscall_user_time_info(cpu)->pvti; +} + #ifdef CONFIG_X86_64 +static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l, + void *v) +{ + struct task_migration_notifier *mn = v; + struct pvclock_vsyscall_time_info *pvti; + + pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu); + + /* this is NULL when pvclock vsyscall is not initialized */ + if (unlikely(pvti == NULL)) + return NOTIFY_DONE; + + pvti->migrate_count++; + + return NOTIFY_DONE; +} + +static struct notifier_block pvclock_migrate = { + .notifier_call = pvclock_task_migrate, +}; + /* * Initialize the generic pvclock vsyscall state. This will allocate * a/some page(s) for the per-vcpu pvclock information, set up a @@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i, WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE); + pvclock_vdso_info = i; + for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) { __set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx, __pa(i) + (idx*PAGE_SIZE), PAGE_KERNEL_VVAR); } + + register_task_migration_notifier(&pvclock_migrate); + return 0; } #endif diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index bae6c60..86db4bc 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -183,6 +183,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = { }, }, + /* ASRock */ + { /* Handle problems with rebooting on ASRock Q1900DC-ITX */ + .callback = set_pci_reboot, + .ident = "ASRock Q1900DC-ITX", + .matches = { + DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"), + DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"), + }, + }, + /* ASUS */ { /* Handle problems with rebooting on ASUS P4S800 */ .callback = set_bios_reboot, diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index e13f8e7..77630d5 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -226,23 +226,23 @@ swap_pages: movl (%ebx), %ecx addl $4, %ebx 1: - testl $0x1, %ecx /* is it a destination page */ + testb $0x1, %cl /* is it a destination page */ jz 2f movl %ecx, %edi andl $0xfffff000, %edi jmp 0b 2: - testl $0x2, %ecx /* is it an indirection page */ + testb $0x2, %cl /* is it an indirection page */ jz 2f movl %ecx, %ebx andl $0xfffff000, %ebx jmp 0b 2: - testl $0x4, %ecx /* is it the done indicator */ + testb $0x4, %cl /* is it the done indicator */ jz 2f jmp 3f 2: - testl $0x8, %ecx /* is it the source indicator */ + testb $0x8, %cl /* is it the source indicator */ jz 0b /* Ignore it otherwise */ movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 3fd2c69..98111b3 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -123,7 +123,7 @@ identity_mapped: * Set cr4 to a known state: * - physical address extension enabled */ - movq $X86_CR4_PAE, %rax + movl $X86_CR4_PAE, %eax movq %rax, %cr4 jmp 1f @@ -221,23 +221,23 @@ swap_pages: movq (%rbx), %rcx addq $8, %rbx 1: - testq $0x1, %rcx /* is it a destination page? */ + testb $0x1, %cl /* is it a destination page? */ jz 2f movq %rcx, %rdi andq $0xfffffffffffff000, %rdi jmp 0b 2: - testq $0x2, %rcx /* is it an indirection page? */ + testb $0x2, %cl /* is it an indirection page? */ jz 2f movq %rcx, %rbx andq $0xfffffffffffff000, %rbx jmp 0b 2: - testq $0x4, %rcx /* is it the done indicator? */ + testb $0x4, %cl /* is it the done indicator? */ jz 2f jmp 3f 2: - testq $0x8, %rcx /* is it the source indicator? */ + testb $0x8, %cl /* is it the source indicator? */ jz 0b /* Ignore it otherwise */ movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi @@ -246,17 +246,17 @@ swap_pages: movq %rsi, %rax movq %r10, %rdi - movq $512, %rcx + movl $512, %ecx rep ; movsq movq %rax, %rdi movq %rdx, %rsi - movq $512, %rcx + movl $512, %ecx rep ; movsq movq %rdx, %rdi movq %r10, %rsi - movq $512, %rcx + movl $512, %ecx rep ; movsq lea PAGE_SIZE(%rax), %rsi diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0a2421c..d74ac33 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -354,7 +354,7 @@ static void __init relocate_initrd(void) mapaddr = ramdisk_image & PAGE_MASK; p = early_memremap(mapaddr, clen+slop); memcpy(q, p+slop, clen); - early_iounmap(p, clen+slop); + early_memunmap(p, clen+slop); q += clen; ramdisk_image += clen; ramdisk_size -= clen; @@ -438,7 +438,7 @@ static void __init parse_setup_data(void) data_len = data->len + sizeof(struct setup_data); data_type = data->type; pa_next = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); switch (data_type) { case SETUP_E820_EXT: @@ -470,7 +470,7 @@ static void __init e820_reserve_setup_data(void) E820_RAM, E820_RESERVED_KERN); found = 1; pa_data = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); } if (!found) return; @@ -491,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void) data = early_memremap(pa_data, sizeof(*data)); memblock_reserve(pa_data, sizeof(*data) + data->len); pa_data = data->next; - early_iounmap(data, sizeof(*data)); + early_memunmap(data, sizeof(*data)); } } @@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void) static int dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p) { - pr_emerg("Kernel Offset: 0x%lx from 0x%lx " - "(relocation range: 0x%lx-0x%lx)\n", - (unsigned long)&_text - __START_KERNEL, __START_KERNEL, - __START_KERNEL_map, MODULES_VADDR-1); + if (kaslr_enabled()) { + pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n", + (unsigned long)&_text - __START_KERNEL, + __START_KERNEL, + __START_KERNEL_map, + MODULES_VADDR-1); + } else { + pr_emerg("Kernel Offset: disabled\n"); + } return 0; } diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e504246..3e58186 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -61,8 +61,7 @@ regs->seg = GET_SEG(seg) | 3; \ } while (0) -int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, - unsigned long *pax) +int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc) { void __user *buf; unsigned int tmpflags; @@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, #endif /* CONFIG_X86_32 */ COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx); - COPY(dx); COPY(cx); COPY(ip); + COPY(dx); COPY(cx); COPY(ip); COPY(ax); #ifdef CONFIG_X86_64 COPY(r8); @@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); regs->orig_ax = -1; /* disable syscall checks */ get_user_ex(buf, &sc->fpstate); - - get_user_ex(*pax, &sc->ax); } get_user_catch(err); err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + force_iret(); + return err; } @@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, #else /* !CONFIG_X86_32 */ put_user_ex(regs->flags, &sc->flags); put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); + put_user_ex(0, &sc->__pad2); + put_user_ex(0, &sc->__pad1); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. + */ regs->cs = __USER_CS; + regs->ss = __USER_DS; return 0; } @@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct sigframe __user *frame; - unsigned long ax; sigset_t set; frame = (struct sigframe __user *)(regs->sp - 8); @@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->sc, &ax)) + if (restore_sigcontext(regs, &frame->sc)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "sigreturn"); @@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void) { struct pt_regs *regs = current_pt_regs(); struct rt_sigframe __user *frame; - unsigned long ax; sigset_t set; frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long)); @@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (restore_altstack(&frame->uc.uc_stack)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "rt_sigreturn"); @@ -679,7 +680,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs) * Ensure the signal handler starts with the new fpu state. */ if (used_math()) - drop_init_fpu(current); + fpu_reset_state(current); } signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP)); } @@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void) struct pt_regs *regs = current_pt_regs(); struct rt_sigframe_x32 __user *frame; sigset_t set; - unsigned long ax; frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8); @@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void) set_current_blocked(&set); - if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax)) + if (restore_sigcontext(regs, &frame->uc.uc_mcontext)) goto badframe; if (compat_restore_altstack(&frame->uc.uc_stack)) goto badframe; - return ax; + return regs->ax; badframe: signal_fault(regs, frame, "x32 rt_sigreturn"); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index febc6aa..50e547e 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -77,9 +77,6 @@ #include <asm/realmode.h> #include <asm/misc.h> -/* State of each CPU */ -DEFINE_PER_CPU(int, cpu_state) = { 0 }; - /* Number of siblings per CPU package */ int smp_num_siblings = 1; EXPORT_SYMBOL(smp_num_siblings); @@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused) lock_vector_lock(); set_cpu_online(smp_processor_id(), true); unlock_vector_lock(); - per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE; + cpu_set_state_online(smp_processor_id()); x86_platform.nmi_init(); /* enable local interrupts */ @@ -779,6 +776,26 @@ out: return boot_error; } +void common_cpu_up(unsigned int cpu, struct task_struct *idle) +{ + /* Just in case we booted with a single CPU. */ + alternatives_enable_smp(); + + per_cpu(current_task, cpu) = idle; + +#ifdef CONFIG_X86_32 + /* Stack for startup_32 can be just as for start_secondary onwards */ + irq_ctx_init(cpu); + per_cpu(cpu_current_top_of_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; +#else + clear_tsk_thread_flag(idle, TIF_FORK); + initial_gs = per_cpu_offset(cpu); +#endif + per_cpu(kernel_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; +} + /* * NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad * (ie clustered apic addressing mode), this is a LOGICAL apic ID. @@ -796,23 +813,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) int cpu0_nmi_registered = 0; unsigned long timeout; - /* Just in case we booted with a single CPU. */ - alternatives_enable_smp(); - idle->thread.sp = (unsigned long) (((struct pt_regs *) (THREAD_SIZE + task_stack_page(idle))) - 1); - per_cpu(current_task, cpu) = idle; -#ifdef CONFIG_X86_32 - /* Stack for startup_32 can be just as for start_secondary onwards */ - irq_ctx_init(cpu); -#else - clear_tsk_thread_flag(idle, TIF_FORK); - initial_gs = per_cpu_offset(cpu); -#endif - per_cpu(kernel_stack, cpu) = - (unsigned long)task_stack_page(idle) - - KERNEL_STACK_OFFSET + THREAD_SIZE; early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu); initial_code = (unsigned long)start_secondary; stack_start = idle->thread.sp; @@ -948,11 +951,16 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle) */ mtrr_save_state(); - per_cpu(cpu_state, cpu) = CPU_UP_PREPARE; + /* x86 CPUs take themselves offline, so delayed offline is OK. */ + err = cpu_check_up_prepare(cpu); + if (err && err != -EBUSY) + return err; /* the FPU context is blank, nobody can own it */ __cpu_disable_lazy_restore(cpu); + common_cpu_up(cpu, tidle); + err = do_boot_cpu(apicid, cpu, tidle); if (err) { pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu); @@ -1086,8 +1094,6 @@ static int __init smp_sanity_check(unsigned max_cpus) return SMP_NO_APIC; } - verify_local_APIC(); - /* * If SMP should be disabled, then really disable it! */ @@ -1191,7 +1197,7 @@ void __init native_smp_prepare_boot_cpu(void) switch_to_new_gdt(me); /* already set me in cpu_online_mask in boot_cpu_init() */ cpumask_set_cpu(me, cpu_callout_mask); - per_cpu(cpu_state, me) = CPU_ONLINE; + cpu_set_state_online(me); } void __init native_smp_cpus_done(unsigned int max_cpus) @@ -1318,14 +1324,10 @@ static void __ref remove_cpu_from_maps(int cpu) numa_remove_cpu(cpu); } -static DEFINE_PER_CPU(struct completion, die_complete); - void cpu_disable_common(void) { int cpu = smp_processor_id(); - init_completion(&per_cpu(die_complete, smp_processor_id())); - remove_siblinginfo(cpu); /* It's now safe to remove this processor from the online map */ @@ -1349,24 +1351,27 @@ int native_cpu_disable(void) return 0; } -void cpu_die_common(unsigned int cpu) +int common_cpu_die(unsigned int cpu) { - wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ); -} + int ret = 0; -void native_cpu_die(unsigned int cpu) -{ /* We don't do anything here: idle task is faking death itself. */ - cpu_die_common(cpu); - /* They ack this in play_dead() by setting CPU_DEAD */ - if (per_cpu(cpu_state, cpu) == CPU_DEAD) { + if (cpu_wait_death(cpu, 5)) { if (system_state == SYSTEM_RUNNING) pr_info("CPU %u is now offline\n", cpu); } else { pr_err("CPU %u didn't die...\n", cpu); + ret = -1; } + + return ret; +} + +void native_cpu_die(unsigned int cpu) +{ + common_cpu_die(cpu); } void play_dead_common(void) @@ -1375,10 +1380,8 @@ void play_dead_common(void) reset_lazy_tlbstate(); amd_e400_remove_cpu(raw_smp_processor_id()); - mb(); /* Ack it */ - __this_cpu_write(cpu_state, CPU_DEAD); - complete(&per_cpu(die_complete, smp_processor_id())); + (void)cpu_report_death(); /* * With physical CPU hotplug, we should halt the cpu diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c index 30277e2..10e0272 100644 --- a/arch/x86/kernel/sys_x86_64.c +++ b/arch/x86/kernel/sys_x86_64.c @@ -34,10 +34,26 @@ static unsigned long get_align_mask(void) return va_align.mask; } +/* + * To avoid aliasing in the I$ on AMD F15h, the bits defined by the + * va_align.bits, [12:upper_bit), are set to a random value instead of + * zeroing them. This random value is computed once per boot. This form + * of ASLR is known as "per-boot ASLR". + * + * To achieve this, the random value is added to the info.align_offset + * value before calling vm_unmapped_area() or ORed directly to the + * address. + */ +static unsigned long get_align_bits(void) +{ + return va_align.bits & get_align_mask(); +} + unsigned long align_vdso_addr(unsigned long addr) { unsigned long align_mask = get_align_mask(); - return (addr + align_mask) & ~align_mask; + addr = (addr + align_mask) & ~align_mask; + return addr | get_align_bits(); } static int __init control_va_addr_alignment(char *str) @@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr, info.length = len; info.low_limit = begin; info.high_limit = end; - info.align_mask = filp ? get_align_mask() : 0; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } return vm_unmapped_area(&info); } @@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, info.length = len; info.low_limit = PAGE_SIZE; info.high_limit = mm->mmap_base; - info.align_mask = filp ? get_align_mask() : 0; + info.align_mask = 0; info.align_offset = pgoff << PAGE_SHIFT; + if (filp) { + info.align_mask = get_align_mask(); + info.align_offset += get_align_bits(); + } addr = vm_unmapped_area(&info); if (!(addr & ~PAGE_MASK)) return addr; diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index e9bcd57..3777189 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -5,21 +5,29 @@ #include <linux/cache.h> #include <asm/asm-offsets.h> -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; #include <asm/syscalls_32.h> #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c index b79133a..5ecbfe5 100644 --- a/arch/x86/kernel/test_rodata.c +++ b/arch/x86/kernel/test_rodata.c @@ -57,7 +57,7 @@ int rodata_test(void) /* test 3: check the value hasn't changed */ /* If this test fails, we managed to overwrite the data */ if (!rodata_test_data) { - printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n"); + printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n"); return -ENODEV; } /* test 4: check if the rodata section is 4Kb aligned */ diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c index 25adc0e..d39c091 100644 --- a/arch/x86/kernel/time.c +++ b/arch/x86/kernel/time.c @@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs) { unsigned long pc = instruction_pointer(regs); - if (!user_mode_vm(regs) && in_lock_functions(pc)) { + if (!user_mode(regs) && in_lock_functions(pc)) { #ifdef CONFIG_FRAME_POINTER return *(unsigned long *)(regs->bp + sizeof(long)); #else diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4ff5d16..324ab52 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) { enum ctx_state prev_state; - if (user_mode_vm(regs)) { + if (user_mode(regs)) { /* Other than that, we're just an exception. */ prev_state = exception_enter(); } else { @@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs) * but we need to notify RCU. */ rcu_nmi_enter(); - prev_state = IN_KERNEL; /* the value is irrelevant. */ + prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */ } /* @@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) /* Must be before exception_exit. */ preempt_count_sub(HARDIRQ_OFFSET); - if (user_mode_vm(regs)) + if (user_mode(regs)) return exception_exit(prev_state); else rcu_nmi_exit(); @@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) * * IST exception handlers normally cannot schedule. As a special * exception, if the exception interrupted userspace code (i.e. - * user_mode_vm(regs) would return true) and the exception was not + * user_mode(regs) would return true) and the exception was not * a double fault, it can be safe to schedule. ist_begin_non_atomic() * begins a non-atomic section within an ist_enter()/ist_exit() region. * Callers are responsible for enabling interrupts themselves inside @@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state) */ void ist_begin_non_atomic(struct pt_regs *regs) { - BUG_ON(!user_mode_vm(regs)); + BUG_ON(!user_mode(regs)); /* * Sanity check: we need to be on the normal thread stack. This * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) - & ~(THREAD_SIZE - 1)) != 0); + BUG_ON((unsigned long)(current_top_of_stack() - + current_stack_pointer()) >= THREAD_SIZE); preempt_count_sub(HARDIRQ_OFFSET); } @@ -194,8 +194,7 @@ static nokprobe_inline int do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, struct pt_regs *regs, long error_code) { -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { /* * Traps 0, 1, 3, 4, and 5 should be forwarded to vm86. * On nmi (interrupt 2), do_trap should not be called. @@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str, } return -1; } -#endif + if (!user_mode(regs)) { if (!fixup_exception(regs)) { tsk->thread.error_code = error_code; @@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code) goto exit; conditional_sti(regs); - if (!user_mode_vm(regs)) + if (!user_mode(regs)) die("bounds", regs, error_code); if (!cpu_feature_enabled(X86_FEATURE_MPX)) { @@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code) prev_state = exception_enter(); conditional_sti(regs); -#ifdef CONFIG_X86_32 - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { local_irq_enable(); handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code); goto exit; } -#endif tsk = current; if (!user_mode(regs)) { @@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s) /* Copy the remainder of the stack from the current stack. */ memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip)); - BUG_ON(!user_mode_vm(&new_stack->regs)); + BUG_ON(!user_mode(&new_stack->regs)); return new_stack; } NOKPROBE_SYMBOL(fixup_bad_iret); @@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) * then it's very likely the result of an icebp/int01 trap. * User wants a sigtrap for that. */ - if (!dr6 && user_mode_vm(regs)) + if (!dr6 && user_mode(regs)) user_icebp = 1; /* Catch kmemcheck conditions first of all! */ @@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code) /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); - if (regs->flags & X86_VM_MASK) { + if (v8086_mode(regs)) { handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, X86_TRAP_DB); preempt_conditional_cli(regs); @@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) return; conditional_sti(regs); - if (!user_mode_vm(regs)) + if (!user_mode(regs)) { if (!fixup_exception(regs)) { task->thread.error_code = error_code; @@ -734,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr) /* * Save the info for the exception handler and clear the error. */ - save_init_fpu(task); + unlazy_fpu(task); task->thread.trap_nr = trapnr; task->thread.error_code = error_code; info.si_signo = SIGFPE; @@ -863,7 +860,7 @@ void math_state_restore(void) kernel_fpu_disable(); __thread_fpu_begin(tsk); if (unlikely(restore_fpu_checking(tsk))) { - drop_init_fpu(tsk); + fpu_reset_state(tsk); force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk); } else { tsk->thread.fpu_counter++; @@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) /* Set of traps needed for early debugging. */ void __init early_trap_init(void) { - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* + * Don't use IST to set DEBUG_STACK as it doesn't work until TSS + * is ready in cpu_init() <-- trap_init(). Before trap_init(), + * CPU runs at ring 0 so it is impossible to hit an invalid + * stack. Using the original stack works well enough at this + * early stage. DEBUG_STACK will be equipped after cpu_init() in + * trap_init(). + * + * We don't need to set trace_idt_table like set_intr_gate(), + * since we don't have trace_debug and it will be reset to + * 'debug' in trap_init() by set_intr_gate_ist(). + */ + set_intr_gate_notrace(X86_TRAP_DB, debug); /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + set_system_intr_gate(X86_TRAP_BP, &int3); #ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, page_fault); #endif @@ -1005,6 +1014,15 @@ void __init trap_init(void) */ cpu_init(); + /* + * X86_TRAP_DB and X86_TRAP_BP have been set + * in early_trap_init(). However, ITS works only after + * cpu_init() loads TSS. See comments in early_trap_init(). + */ + set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + x86_init.irqs.trap_init(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c index 81f8adb0..0b81ad6 100644 --- a/arch/x86/kernel/uprobes.c +++ b/arch/x86/kernel/uprobes.c @@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, int ret = NOTIFY_DONE; /* We are only interested in userspace traps */ - if (regs && !user_mode_vm(regs)) + if (regs && !user_mode(regs)) return NOTIFY_DONE; switch (val) { diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e8edcf5..fc9db6e 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) do_exit(SIGSEGV); } - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, ¤t->thread); @@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c index cdc6cf9..87a815b 100644 --- a/arch/x86/kernel/xsave.c +++ b/arch/x86/kernel/xsave.c @@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) config_enabled(CONFIG_IA32_EMULATION)); if (!buf) { - drop_init_fpu(tsk); + fpu_reset_state(tsk); return 0; } @@ -416,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size) */ user_fpu_begin(); if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) { - drop_init_fpu(tsk); + fpu_reset_state(tsk); return -1; } } @@ -678,19 +678,13 @@ void xsave_init(void) this_func(); } -static inline void __init eager_fpu_init_bp(void) -{ - current->thread.fpu.state = - alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct)); - if (!init_xstate_buf) - setup_init_fpu_buf(); -} - -void eager_fpu_init(void) +/* + * setup_init_fpu_buf() is __init and it is OK to call it here because + * init_xstate_buf will be unset only once during boot. + */ +void __init_refok eager_fpu_init(void) { - static __refdata void (*boot_func)(void) = eager_fpu_init_bp; - - clear_used_math(); + WARN_ON(used_math()); current_thread_info()->status = 0; if (eagerfpu == ENABLE) @@ -701,21 +695,8 @@ void eager_fpu_init(void) return; } - if (boot_func) { - boot_func(); - boot_func = NULL; - } - - /* - * This is same as math_state_restore(). But use_xsave() is - * not yet patched to use math_state_restore(). - */ - init_fpu(current); - __thread_fpu_begin(current); - if (cpu_has_xsave) - xrstor_state(init_xstate_buf, -1); - else - fxrstor_checking(&init_xstate_buf->i387); + if (!init_xstate_buf) + setup_init_fpu_buf(); } /* |