summaryrefslogtreecommitdiffstats
path: root/arch/x86/kernel
diff options
context:
space:
mode:
Diffstat (limited to 'arch/x86/kernel')
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/alternative.c163
-rw-r--r--arch/x86/kernel/apic/apic.c62
-rw-r--r--arch/x86/kernel/apic/x2apic_cluster.c8
-rw-r--r--arch/x86/kernel/apic/x2apic_uv_x.c89
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c9
-rw-r--r--arch/x86/kernel/cpu/common.c87
-rw-r--r--arch/x86/kernel/cpu/intel_cacheinfo.c715
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-internal.h11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce-severity.c66
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce.c154
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_amd.c11
-rw-r--r--arch/x86/kernel/cpu/mcheck/mce_intel.c63
-rw-r--r--arch/x86/kernel/cpu/microcode/amd.c1
-rw-r--r--arch/x86/kernel/cpu/microcode/core_early.c75
-rw-r--r--arch/x86/kernel/cpu/microcode/intel.c4
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_early.c345
-rw-r--r--arch/x86/kernel/cpu/microcode/intel_lib.c22
-rw-r--r--arch/x86/kernel/cpu/mkcapflags.sh2
-rw-r--r--arch/x86/kernel/cpu/perf_event.c18
-rw-r--r--arch/x86/kernel/crash.c2
-rw-r--r--arch/x86/kernel/devicetree.c4
-rw-r--r--arch/x86/kernel/dumpstack.c15
-rw-r--r--arch/x86/kernel/dumpstack_32.c13
-rw-r--r--arch/x86/kernel/dumpstack_64.c11
-rw-r--r--arch/x86/kernel/e820.c2
-rw-r--r--arch/x86/kernel/early_printk.c32
-rw-r--r--arch/x86/kernel/entry_32.S93
-rw-r--r--arch/x86/kernel/entry_64.S964
-rw-r--r--arch/x86/kernel/head64.c3
-rw-r--r--arch/x86/kernel/head_32.S3
-rw-r--r--arch/x86/kernel/head_64.S6
-rw-r--r--arch/x86/kernel/i387.c56
-rw-r--r--arch/x86/kernel/ioport.c2
-rw-r--r--arch/x86/kernel/irq.c4
-rw-r--r--arch/x86/kernel/irq_32.c2
-rw-r--r--arch/x86/kernel/irq_64.c2
-rw-r--r--arch/x86/kernel/irqinit.c3
-rw-r--r--arch/x86/kernel/kgdb.c6
-rw-r--r--arch/x86/kernel/kprobes/core.c4
-rw-r--r--arch/x86/kernel/module.c11
-rw-r--r--arch/x86/kernel/perf_regs.c40
-rw-r--r--arch/x86/kernel/process.c106
-rw-r--r--arch/x86/kernel/process_32.c27
-rw-r--r--arch/x86/kernel/process_64.c24
-rw-r--r--arch/x86/kernel/ptrace.c12
-rw-r--r--arch/x86/kernel/pvclock.c44
-rw-r--r--arch/x86/kernel/reboot.c10
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S8
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S16
-rw-r--r--arch/x86/kernel/setup.c21
-rw-r--r--arch/x86/kernel/signal.c52
-rw-r--r--arch/x86/kernel/smpboot.c77
-rw-r--r--arch/x86/kernel/sys_x86_64.c30
-rw-r--r--arch/x86/kernel/syscall_32.c16
-rw-r--r--arch/x86/kernel/test_rodata.c2
-rw-r--r--arch/x86/kernel/time.c2
-rw-r--r--arch/x86/kernel/traps.c62
-rw-r--r--arch/x86/kernel/uprobes.c2
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/kernel/xsave.c39
63 files changed, 1799 insertions, 1942 deletions
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70..c887cd9 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += mcount_64.o
obj-y += syscall_$(BITS).o vsyscall_gtod.o
+obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f..aef6531 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif
-#define DPRINTK(fmt, ...) \
-do { \
- if (debug_alternative) \
- printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+#define DPRINTK(fmt, args...) \
+do { \
+ if (debug_alternative) \
+ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...) \
+do { \
+ if (unlikely(debug_alternative)) { \
+ int j; \
+ \
+ if (!(len)) \
+ break; \
+ \
+ printk(KERN_DEBUG fmt, ##args); \
+ for (j = 0; j < (len) - 1; j++) \
+ printk(KERN_CONT "%02hhx ", buf[j]); \
+ printk(KERN_CONT "%02hhx\n", buf[j]); \
+ } \
} while (0)
/*
@@ -243,12 +258,89 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);
-/* Replace instructions with better alternatives for this CPU type.
- This runs before SMP is initialized to avoid SMP problems with
- self modifying code. This implies that asymmetric systems where
- APs have less capabilities than the boot processor are not handled.
- Tough. Make sure you disable such features by hand. */
+/*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+ return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+ u8 *next_rip, *tgt_rip;
+ s32 n_dspl, o_dspl;
+ int repl_len;
+
+ if (a->replacementlen != 5)
+ return;
+
+ o_dspl = *(s32 *)(insnbuf + 1);
+
+ /* next_rip of the replacement JMP */
+ next_rip = repl_insn + a->replacementlen;
+ /* target rip of the replacement JMP */
+ tgt_rip = next_rip + o_dspl;
+ n_dspl = tgt_rip - orig_insn;
+
+ DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+ if (tgt_rip - orig_insn >= 0) {
+ if (n_dspl - 2 <= 127)
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ /* negative offset */
+ } else {
+ if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ }
+
+two_byte_jmp:
+ n_dspl -= 2;
+
+ insnbuf[0] = 0xeb;
+ insnbuf[1] = (s8)n_dspl;
+ add_nops(insnbuf + 2, 3);
+
+ repl_len = 2;
+ goto done;
+
+five_byte_jmp:
+ n_dspl -= 5;
+
+ insnbuf[0] = 0xe9;
+ *(s32 *)&insnbuf[1] = n_dspl;
+ repl_len = 5;
+
+done:
+
+ DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+ n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+ if (instr[0] != 0x90)
+ return;
+
+ add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+ instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
@@ -256,10 +348,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
- DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ DPRINTK("alt table %p -> %p", start, end);
/*
* The scan order should be from start to end. A later scanned
- * alternative code can overwrite a previous scanned alternative code.
+ * alternative code can overwrite previously scanned alternative code.
* Some kernel functions (e.g. memcpy, memset, etc) use this order to
* patch code.
*
@@ -267,29 +359,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
+ int insnbuf_sz = 0;
+
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
- if (!boot_cpu_has(a->cpuid))
+ if (!boot_cpu_has(a->cpuid)) {
+ if (a->padlen > 1)
+ optimize_nops(a, instr);
+
continue;
+ }
+
+ DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d), pad: %d",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
+ instr, a->instrlen,
+ replacement, a->replacementlen, a->padlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
memcpy(insnbuf, replacement, a->replacementlen);
+ insnbuf_sz = a->replacementlen;
/* 0xe8 is a relative jump; fix the offset. */
- if (*insnbuf == 0xe8 && a->replacementlen == 5)
- *(s32 *)(insnbuf + 1) += replacement - instr;
+ if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+ DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+ *(s32 *)(insnbuf + 1),
+ (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+ }
+
+ if (a->replacementlen && is_jmp(replacement[0]))
+ recompute_jump(a, instr, replacement, insnbuf);
- add_nops(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
+ if (a->instrlen > a->replacementlen) {
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ insnbuf_sz += a->instrlen - a->replacementlen;
+ }
+ DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
- text_poke_early(instr, insnbuf, a->instrlen);
+ text_poke_early(instr, insnbuf, insnbuf_sz);
}
}
#ifdef CONFIG_SMP
-
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
@@ -371,8 +488,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
- DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
- __func__, smp->locks, smp->locks_end,
+ DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+ smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +557,7 @@ int alternatives_text_reserved(void *start, void *end)
return 0;
}
-#endif
+#endif /* CONFIG_SMP */
#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
@@ -601,7 +718,7 @@ int poke_int3_handler(struct pt_regs *regs)
if (likely(!bp_patching_in_progress))
return 0;
- if (user_mode_vm(regs) || regs->ip != (unsigned long)bp_int3_addr)
+ if (user_mode(regs) || regs->ip != (unsigned long)bp_int3_addr)
return 0;
/* set up the specified breakpoint handler */
diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c
index ad3639a..dcb5285 100644
--- a/arch/x86/kernel/apic/apic.c
+++ b/arch/x86/kernel/apic/apic.c
@@ -1084,67 +1084,6 @@ void lapic_shutdown(void)
local_irq_restore(flags);
}
-/*
- * This is to verify that we're looking at a real local APIC.
- * Check these against your board if the CPUs aren't getting
- * started for no apparent reason.
- */
-int __init verify_local_APIC(void)
-{
- unsigned int reg0, reg1;
-
- /*
- * The version register is read-only in a real APIC.
- */
- reg0 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg0);
- apic_write(APIC_LVR, reg0 ^ APIC_LVR_MASK);
- reg1 = apic_read(APIC_LVR);
- apic_printk(APIC_DEBUG, "Getting VERSION: %x\n", reg1);
-
- /*
- * The two version reads above should print the same
- * numbers. If the second one is different, then we
- * poke at a non-APIC.
- */
- if (reg1 != reg0)
- return 0;
-
- /*
- * Check if the version looks reasonably.
- */
- reg1 = GET_APIC_VERSION(reg0);
- if (reg1 == 0x00 || reg1 == 0xff)
- return 0;
- reg1 = lapic_get_maxlvt();
- if (reg1 < 0x02 || reg1 == 0xff)
- return 0;
-
- /*
- * The ID register is read/write in a real APIC.
- */
- reg0 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg0);
- apic_write(APIC_ID, reg0 ^ apic->apic_id_mask);
- reg1 = apic_read(APIC_ID);
- apic_printk(APIC_DEBUG, "Getting ID: %x\n", reg1);
- apic_write(APIC_ID, reg0);
- if (reg1 != (reg0 ^ apic->apic_id_mask))
- return 0;
-
- /*
- * The next two are just to see if we have sane values.
- * They're only really relevant if we're in Virtual Wire
- * compatibility mode, but most boxes are anymore.
- */
- reg0 = apic_read(APIC_LVT0);
- apic_printk(APIC_DEBUG, "Getting LVT0: %x\n", reg0);
- reg1 = apic_read(APIC_LVT1);
- apic_printk(APIC_DEBUG, "Getting LVT1: %x\n", reg1);
-
- return 1;
-}
-
/**
* sync_Arb_IDs - synchronize APIC bus arbitration IDs
*/
@@ -2283,7 +2222,6 @@ int __init APIC_init_uniprocessor(void)
disable_ioapic_support();
default_setup_apic_routing();
- verify_local_APIC();
apic_bsp_setup(true);
return 0;
}
diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c
index e658f21..d9d0bd2 100644
--- a/arch/x86/kernel/apic/x2apic_cluster.c
+++ b/arch/x86/kernel/apic/x2apic_cluster.c
@@ -135,12 +135,12 @@ static void init_x2apic_ldr(void)
per_cpu(x86_cpu_to_logical_apicid, this_cpu) = apic_read(APIC_LDR);
- __cpu_set(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
+ cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, this_cpu));
for_each_online_cpu(cpu) {
if (x2apic_cluster(this_cpu) != x2apic_cluster(cpu))
continue;
- __cpu_set(this_cpu, per_cpu(cpus_in_cluster, cpu));
- __cpu_set(cpu, per_cpu(cpus_in_cluster, this_cpu));
+ cpumask_set_cpu(this_cpu, per_cpu(cpus_in_cluster, cpu));
+ cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, this_cpu));
}
}
@@ -195,7 +195,7 @@ static int x2apic_init_cpu_notifier(void)
BUG_ON(!per_cpu(cpus_in_cluster, cpu) || !per_cpu(ipi_mask, cpu));
- __cpu_set(cpu, per_cpu(cpus_in_cluster, cpu));
+ cpumask_set_cpu(cpu, per_cpu(cpus_in_cluster, cpu));
register_hotcpu_notifier(&x2apic_cpu_notifier);
return 1;
}
diff --git a/arch/x86/kernel/apic/x2apic_uv_x.c b/arch/x86/kernel/apic/x2apic_uv_x.c
index 8e9dcfd..c8d9295 100644
--- a/arch/x86/kernel/apic/x2apic_uv_x.c
+++ b/arch/x86/kernel/apic/x2apic_uv_x.c
@@ -144,33 +144,60 @@ static void __init uv_set_apicid_hibit(void)
static int __init uv_acpi_madt_oem_check(char *oem_id, char *oem_table_id)
{
- int pnodeid, is_uv1, is_uv2, is_uv3;
-
- is_uv1 = !strcmp(oem_id, "SGI");
- is_uv2 = !strcmp(oem_id, "SGI2");
- is_uv3 = !strncmp(oem_id, "SGI3", 4); /* there are varieties of UV3 */
- if (is_uv1 || is_uv2 || is_uv3) {
- uv_hub_info->hub_revision =
- (is_uv1 ? UV1_HUB_REVISION_BASE :
- (is_uv2 ? UV2_HUB_REVISION_BASE :
- UV3_HUB_REVISION_BASE));
- pnodeid = early_get_pnodeid();
- early_get_apic_pnode_shift();
- x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
- x86_platform.nmi_init = uv_nmi_init;
- if (!strcmp(oem_table_id, "UVL"))
- uv_system_type = UV_LEGACY_APIC;
- else if (!strcmp(oem_table_id, "UVX"))
- uv_system_type = UV_X2APIC;
- else if (!strcmp(oem_table_id, "UVH")) {
- __this_cpu_write(x2apic_extra_bits,
- pnodeid << uvh_apicid.s.pnode_shift);
- uv_system_type = UV_NON_UNIQUE_APIC;
- uv_set_apicid_hibit();
- return 1;
- }
+ int pnodeid;
+ int uv_apic;
+
+ if (strncmp(oem_id, "SGI", 3) != 0)
+ return 0;
+
+ /*
+ * Determine UV arch type.
+ * SGI: UV100/1000
+ * SGI2: UV2000/3000
+ * SGI3: UV300 (truncated to 4 chars because of different varieties)
+ */
+ uv_hub_info->hub_revision =
+ !strncmp(oem_id, "SGI3", 4) ? UV3_HUB_REVISION_BASE :
+ !strcmp(oem_id, "SGI2") ? UV2_HUB_REVISION_BASE :
+ !strcmp(oem_id, "SGI") ? UV1_HUB_REVISION_BASE : 0;
+
+ if (uv_hub_info->hub_revision == 0)
+ goto badbios;
+
+ pnodeid = early_get_pnodeid();
+ early_get_apic_pnode_shift();
+ x86_platform.is_untracked_pat_range = uv_is_untracked_pat_range;
+ x86_platform.nmi_init = uv_nmi_init;
+
+ if (!strcmp(oem_table_id, "UVX")) { /* most common */
+ uv_system_type = UV_X2APIC;
+ uv_apic = 0;
+
+ } else if (!strcmp(oem_table_id, "UVH")) { /* only UV1 systems */
+ uv_system_type = UV_NON_UNIQUE_APIC;
+ __this_cpu_write(x2apic_extra_bits,
+ pnodeid << uvh_apicid.s.pnode_shift);
+ uv_set_apicid_hibit();
+ uv_apic = 1;
+
+ } else if (!strcmp(oem_table_id, "UVL")) { /* only used for */
+ uv_system_type = UV_LEGACY_APIC; /* very small systems */
+ uv_apic = 0;
+
+ } else {
+ goto badbios;
}
- return 0;
+
+ pr_info("UV: OEM IDs %s/%s, System/HUB Types %d/%d, uv_apic %d\n",
+ oem_id, oem_table_id, uv_system_type,
+ uv_min_hub_revision_id, uv_apic);
+
+ return uv_apic;
+
+badbios:
+ pr_err("UV: OEM_ID:%s OEM_TABLE_ID:%s\n", oem_id, oem_table_id);
+ pr_err("Current BIOS not supported, update kernel and/or BIOS\n");
+ BUG();
}
enum uv_system_type get_uv_system_type(void)
@@ -854,10 +881,14 @@ void __init uv_system_init(void)
unsigned long mmr_base, present, paddr;
unsigned short pnode_mask;
unsigned char n_lshift;
- char *hub = (is_uv1_hub() ? "UV1" :
- (is_uv2_hub() ? "UV2" :
- "UV3"));
+ char *hub = (is_uv1_hub() ? "UV100/1000" :
+ (is_uv2_hub() ? "UV2000/3000" :
+ (is_uv3_hub() ? "UV300" : NULL)));
+ if (!hub) {
+ pr_err("UV: Unknown/unsupported UV hub\n");
+ return;
+ }
pr_info("UV: Found %s hub\n", hub);
map_low_mmrs();
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d3..47703ae 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void)
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- sizeof(struct tss_struct));
+ offsetofend(struct tss_struct, SYSENTER_stack));
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d..5ce6f2d 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void)
#undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239..fd470eb 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -5,6 +5,7 @@
#include <linux/io.h>
#include <linux/sched.h>
+#include <linux/random.h>
#include <asm/processor.h>
#include <asm/apic.h>
#include <asm/cpu.h>
@@ -488,6 +489,9 @@ static void bsp_init_amd(struct cpuinfo_x86 *c)
va_align.mask = (upperbit - 1) & PAGE_MASK;
va_align.flags = ALIGN_VA_32 | ALIGN_VA_64;
+
+ /* A random value per boot for bit slice [12:upper_bit) */
+ va_align.bits = get_random_int() & va_align.mask;
}
}
@@ -711,6 +715,11 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1cd4a1a..a62cf04 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -998,38 +998,37 @@ static void identify_cpu(struct cpuinfo_x86 *c)
#endif
}
-#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
-/* May not be __init: called during resume */
-static void syscall32_cpu_init(void)
-{
- /* Load these always in case some future AMD CPU supports
- SYSENTER from compat mode too. */
- wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
- wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
- wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
-
- wrmsrl(MSR_CSTAR, ia32_cstar_target);
-}
-#endif /* CONFIG_IA32_EMULATION */
-#endif /* CONFIG_X86_64 */
-
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
#ifdef CONFIG_X86_32
void enable_sep_cpu(void)
{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss;
+ int cpu;
- if (!boot_cpu_has(X86_FEATURE_SEP)) {
- put_cpu();
- return;
- }
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ goto out;
+
+ /*
+ * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field --
+ * see the big comment in struct x86_hw_tss's definition.
+ */
tss->x86_tss.ss1 = __KERNEL_CS;
- tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
- wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+ wrmsr(MSR_IA32_SYSENTER_ESP,
+ (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack),
+ 0);
+
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
put_cpu();
}
#endif
@@ -1157,7 +1156,7 @@ static __init int setup_disablecpuid(char *arg)
__setup("clearcpuid=", setup_disablecpuid);
DEFINE_PER_CPU(unsigned long, kernel_stack) =
- (unsigned long)&init_thread_union - KERNEL_STACK_OFFSET + THREAD_SIZE;
+ (unsigned long)&init_thread_union + THREAD_SIZE;
EXPORT_PER_CPU_SYMBOL(kernel_stack);
#ifdef CONFIG_X86_64
@@ -1169,8 +1168,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
/*
- * The following four percpu variables are hot. Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot. Align current_task to
+ * cacheline size such that they fall in the same cacheline.
*/
DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
@@ -1210,10 +1209,23 @@ void syscall_init(void)
*/
wrmsrl(MSR_STAR, ((u64)__USER32_CS)<<48 | ((u64)__KERNEL_CS)<<32);
wrmsrl(MSR_LSTAR, system_call);
- wrmsrl(MSR_CSTAR, ignore_sysret);
#ifdef CONFIG_IA32_EMULATION
- syscall32_cpu_init();
+ wrmsrl(MSR_CSTAR, ia32_cstar_target);
+ /*
+ * This only works on Intel CPUs.
+ * On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
+ * This does not cause SYSENTER to jump to the wrong location, because
+ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
+ */
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
+#else
+ wrmsrl(MSR_CSTAR, ignore_sysret);
+ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
+ wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
+ wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
#endif
/* Flags to clear on syscall */
@@ -1265,6 +1277,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack. Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+ (unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
@@ -1346,7 +1367,7 @@ void cpu_init(void)
*/
load_ucode_ap();
- t = &per_cpu(init_tss, cpu);
+ t = &per_cpu(cpu_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1430,7 +1451,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss, cpu);
struct thread_struct *thread = &curr->thread;
wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index 6596433..edcb0e2 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -7,16 +7,14 @@
* Andi Kleen / Andreas Herrmann : CPUID4 emulation on AMD.
*/
-#include <linux/init.h>
#include <linux/slab.h>
-#include <linux/device.h>
-#include <linux/compiler.h>
+#include <linux/cacheinfo.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/sysfs.h>
#include <linux/pci.h>
#include <asm/processor.h>
-#include <linux/smp.h>
#include <asm/amd_nb.h>
#include <asm/smp.h>
@@ -116,10 +114,10 @@ static const struct _cache_table cache_table[] =
enum _cache_type {
- CACHE_TYPE_NULL = 0,
- CACHE_TYPE_DATA = 1,
- CACHE_TYPE_INST = 2,
- CACHE_TYPE_UNIFIED = 3
+ CTYPE_NULL = 0,
+ CTYPE_DATA = 1,
+ CTYPE_INST = 2,
+ CTYPE_UNIFIED = 3
};
union _cpuid4_leaf_eax {
@@ -159,11 +157,6 @@ struct _cpuid4_info_regs {
struct amd_northbridge *nb;
};
-struct _cpuid4_info {
- struct _cpuid4_info_regs base;
- DECLARE_BITMAP(shared_cpu_map, NR_CPUS);
-};
-
unsigned short num_cache_leaves;
/* AMD doesn't have CPUID4. Emulate it here to report the same
@@ -220,6 +213,13 @@ static const unsigned short assocs[] = {
static const unsigned char levels[] = { 1, 1, 2, 3 };
static const unsigned char types[] = { 1, 2, 3, 3 };
+static const enum cache_type cache_type_map[] = {
+ [CTYPE_NULL] = CACHE_TYPE_NOCACHE,
+ [CTYPE_DATA] = CACHE_TYPE_DATA,
+ [CTYPE_INST] = CACHE_TYPE_INST,
+ [CTYPE_UNIFIED] = CACHE_TYPE_UNIFIED,
+};
+
static void
amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
union _cpuid4_leaf_ebx *ebx,
@@ -291,14 +291,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax,
(ebx->split.ways_of_associativity + 1) - 1;
}
-struct _cache_attr {
- struct attribute attr;
- ssize_t (*show)(struct _cpuid4_info *, char *, unsigned int);
- ssize_t (*store)(struct _cpuid4_info *, const char *, size_t count,
- unsigned int);
-};
-
#if defined(CONFIG_AMD_NB) && defined(CONFIG_SYSFS)
+
/*
* L3 cache descriptors
*/
@@ -325,20 +319,6 @@ static void amd_calc_l3_indices(struct amd_northbridge *nb)
l3->indices = (max(max3(sc0, sc1, sc2), sc3) << 10) - 1;
}
-static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
-{
- int node;
-
- /* only for L3, and not in virtualized environments */
- if (index < 3)
- return;
-
- node = amd_get_nb_id(smp_processor_id());
- this_leaf->nb = node_to_amd_nb(node);
- if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
- amd_calc_l3_indices(this_leaf->nb);
-}
-
/*
* check whether a slot used for disabling an L3 index is occupied.
* @l3: L3 cache descriptor
@@ -359,15 +339,13 @@ int amd_get_l3_disable_slot(struct amd_northbridge *nb, unsigned slot)
return -1;
}
-static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
+static ssize_t show_cache_disable(struct cacheinfo *this_leaf, char *buf,
unsigned int slot)
{
int index;
+ struct amd_northbridge *nb = this_leaf->priv;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- index = amd_get_l3_disable_slot(this_leaf->base.nb, slot);
+ index = amd_get_l3_disable_slot(nb, slot);
if (index >= 0)
return sprintf(buf, "%d\n", index);
@@ -376,9 +354,10 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf,
#define SHOW_CACHE_DISABLE(slot) \
static ssize_t \
-show_cache_disable_##slot(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
+cache_disable_##slot##_show(struct device *dev, \
+ struct device_attribute *attr, char *buf) \
{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
return show_cache_disable(this_leaf, buf, slot); \
}
SHOW_CACHE_DISABLE(0)
@@ -446,25 +425,23 @@ int amd_set_l3_disable_slot(struct amd_northbridge *nb, int cpu, unsigned slot,
return 0;
}
-static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
- const char *buf, size_t count,
- unsigned int slot)
+static ssize_t store_cache_disable(struct cacheinfo *this_leaf,
+ const char *buf, size_t count,
+ unsigned int slot)
{
unsigned long val = 0;
int cpu, err = 0;
+ struct amd_northbridge *nb = this_leaf->priv;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- return -EINVAL;
-
- cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map));
+ cpu = cpumask_first(&this_leaf->shared_cpu_map);
if (kstrtoul(buf, 10, &val) < 0)
return -EINVAL;
- err = amd_set_l3_disable_slot(this_leaf->base.nb, cpu, slot, val);
+ err = amd_set_l3_disable_slot(nb, cpu, slot, val);
if (err) {
if (err == -EEXIST)
pr_warning("L3 slot %d in use/index already disabled!\n",
@@ -476,41 +453,36 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf,
#define STORE_CACHE_DISABLE(slot) \
static ssize_t \
-store_cache_disable_##slot(struct _cpuid4_info *this_leaf, \
- const char *buf, size_t count, \
- unsigned int cpu) \
+cache_disable_##slot##_store(struct device *dev, \
+ struct device_attribute *attr, \
+ const char *buf, size_t count) \
{ \
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev); \
return store_cache_disable(this_leaf, buf, count, slot); \
}
STORE_CACHE_DISABLE(0)
STORE_CACHE_DISABLE(1)
-static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644,
- show_cache_disable_0, store_cache_disable_0);
-static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644,
- show_cache_disable_1, store_cache_disable_1);
-
-static ssize_t
-show_subcaches(struct _cpuid4_info *this_leaf, char *buf, unsigned int cpu)
+static ssize_t subcaches_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
{
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
return sprintf(buf, "%x\n", amd_get_subcaches(cpu));
}
-static ssize_t
-store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
- unsigned int cpu)
+static ssize_t subcaches_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
{
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ int cpu = cpumask_first(&this_leaf->shared_cpu_map);
unsigned long val;
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
- if (!this_leaf->base.nb || !amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- return -EINVAL;
-
if (kstrtoul(buf, 16, &val) < 0)
return -EINVAL;
@@ -520,9 +492,92 @@ store_subcaches(struct _cpuid4_info *this_leaf, const char *buf, size_t count,
return count;
}
-static struct _cache_attr subcaches =
- __ATTR(subcaches, 0644, show_subcaches, store_subcaches);
+static DEVICE_ATTR_RW(cache_disable_0);
+static DEVICE_ATTR_RW(cache_disable_1);
+static DEVICE_ATTR_RW(subcaches);
+
+static umode_t
+cache_private_attrs_is_visible(struct kobject *kobj,
+ struct attribute *attr, int unused)
+{
+ struct device *dev = kobj_to_dev(kobj);
+ struct cacheinfo *this_leaf = dev_get_drvdata(dev);
+ umode_t mode = attr->mode;
+
+ if (!this_leaf->priv)
+ return 0;
+
+ if ((attr == &dev_attr_subcaches.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ return mode;
+
+ if ((attr == &dev_attr_cache_disable_0.attr ||
+ attr == &dev_attr_cache_disable_1.attr) &&
+ amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ return mode;
+
+ return 0;
+}
+
+static struct attribute_group cache_private_group = {
+ .is_visible = cache_private_attrs_is_visible,
+};
+
+static void init_amd_l3_attrs(void)
+{
+ int n = 1;
+ static struct attribute **amd_l3_attrs;
+
+ if (amd_l3_attrs) /* already initialized */
+ return;
+
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
+ n += 2;
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ n += 1;
+
+ amd_l3_attrs = kcalloc(n, sizeof(*amd_l3_attrs), GFP_KERNEL);
+ if (!amd_l3_attrs)
+ return;
+
+ n = 0;
+ if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_0.attr;
+ amd_l3_attrs[n++] = &dev_attr_cache_disable_1.attr;
+ }
+ if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
+ amd_l3_attrs[n++] = &dev_attr_subcaches.attr;
+ cache_private_group.attrs = amd_l3_attrs;
+}
+
+const struct attribute_group *
+cache_get_priv_group(struct cacheinfo *this_leaf)
+{
+ struct amd_northbridge *nb = this_leaf->priv;
+
+ if (this_leaf->level < 3 || !nb)
+ return NULL;
+
+ if (nb && nb->l3_cache.indices)
+ init_amd_l3_attrs();
+
+ return &cache_private_group;
+}
+
+static void amd_init_l3_cache(struct _cpuid4_info_regs *this_leaf, int index)
+{
+ int node;
+
+ /* only for L3, and not in virtualized environments */
+ if (index < 3)
+ return;
+
+ node = amd_get_nb_id(smp_processor_id());
+ this_leaf->nb = node_to_amd_nb(node);
+ if (this_leaf->nb && !this_leaf->nb->l3_cache.indices)
+ amd_calc_l3_indices(this_leaf->nb);
+}
#else
#define amd_init_l3_cache(x, y)
#endif /* CONFIG_AMD_NB && CONFIG_SYSFS */
@@ -546,7 +601,7 @@ cpuid4_cache_lookup_regs(int index, struct _cpuid4_info_regs *this_leaf)
cpuid_count(4, index, &eax.full, &ebx.full, &ecx.full, &edx);
}
- if (eax.split.type == CACHE_TYPE_NULL)
+ if (eax.split.type == CTYPE_NULL)
return -EIO; /* better error ? */
this_leaf->eax = eax;
@@ -575,7 +630,7 @@ static int find_num_cache_leaves(struct cpuinfo_x86 *c)
/* Do cpuid(op) loop to find out num_cache_leaves */
cpuid_count(op, i, &eax, &ebx, &ecx, &edx);
cache_eax.full = eax;
- } while (cache_eax.split.type != CACHE_TYPE_NULL);
+ } while (cache_eax.split.type != CTYPE_NULL);
return i;
}
@@ -626,9 +681,9 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
switch (this_leaf.eax.split.level) {
case 1:
- if (this_leaf.eax.split.type == CACHE_TYPE_DATA)
+ if (this_leaf.eax.split.type == CTYPE_DATA)
new_l1d = this_leaf.size/1024;
- else if (this_leaf.eax.split.type == CACHE_TYPE_INST)
+ else if (this_leaf.eax.split.type == CTYPE_INST)
new_l1i = this_leaf.size/1024;
break;
case 2:
@@ -747,55 +802,52 @@ unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c)
return l2;
}
-#ifdef CONFIG_SYSFS
-
-/* pointer to _cpuid4_info array (for each cache leaf) */
-static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
-#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
-
-#ifdef CONFIG_SMP
-
-static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
+static int __cache_amd_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
{
- struct _cpuid4_info *this_leaf;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf;
int i, sibling;
if (cpu_has_topoext) {
unsigned int apicid, nshared, first, last;
- if (!per_cpu(ici_cpuid4_info, cpu))
- return 0;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- nshared = this_leaf->base.eax.split.num_threads_sharing + 1;
+ this_leaf = this_cpu_ci->info_list + index;
+ nshared = base->eax.split.num_threads_sharing + 1;
apicid = cpu_data(cpu).apicid;
first = apicid - (apicid % nshared);
last = first + nshared - 1;
for_each_online_cpu(i) {
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
+ continue;
+
apicid = cpu_data(i).apicid;
if ((apicid < first) || (apicid > last))
continue;
- if (!per_cpu(ici_cpuid4_info, i))
- continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
+
+ this_leaf = this_cpu_ci->info_list + index;
for_each_online_cpu(sibling) {
apicid = cpu_data(sibling).apicid;
if ((apicid < first) || (apicid > last))
continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
}
}
} else if (index == 3) {
for_each_cpu(i, cpu_llc_shared_mask(cpu)) {
- if (!per_cpu(ici_cpuid4_info, i))
+ this_cpu_ci = get_cpu_cacheinfo(i);
+ if (!this_cpu_ci->info_list)
continue;
- this_leaf = CPUID4_INFO_IDX(i, index);
+ this_leaf = this_cpu_ci->info_list + index;
for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) {
if (!cpu_online(sibling))
continue;
- set_bit(sibling, this_leaf->shared_cpu_map);
+ cpumask_set_cpu(sibling,
+ &this_leaf->shared_cpu_map);
}
}
} else
@@ -804,457 +856,86 @@ static int cache_shared_amd_cpu_map_setup(unsigned int cpu, int index)
return 1;
}
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
+static void __cache_cpumap_setup(unsigned int cpu, int index,
+ struct _cpuid4_info_regs *base)
{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf, *sibling_leaf;
unsigned long num_threads_sharing;
int index_msb, i;
struct cpuinfo_x86 *c = &cpu_data(cpu);
if (c->x86_vendor == X86_VENDOR_AMD) {
- if (cache_shared_amd_cpu_map_setup(cpu, index))
+ if (__cache_amd_cpumap_setup(cpu, index, base))
return;
}
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- num_threads_sharing = 1 + this_leaf->base.eax.split.num_threads_sharing;
+ this_leaf = this_cpu_ci->info_list + index;
+ num_threads_sharing = 1 + base->eax.split.num_threads_sharing;
+ cpumask_set_cpu(cpu, &this_leaf->shared_cpu_map);
if (num_threads_sharing == 1)
- cpumask_set_cpu(cpu, to_cpumask(this_leaf->shared_cpu_map));
- else {
- index_msb = get_count_order(num_threads_sharing);
-
- for_each_online_cpu(i) {
- if (cpu_data(i).apicid >> index_msb ==
- c->apicid >> index_msb) {
- cpumask_set_cpu(i,
- to_cpumask(this_leaf->shared_cpu_map));
- if (i != cpu && per_cpu(ici_cpuid4_info, i)) {
- sibling_leaf =
- CPUID4_INFO_IDX(i, index);
- cpumask_set_cpu(cpu, to_cpumask(
- sibling_leaf->shared_cpu_map));
- }
- }
- }
- }
-}
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
- struct _cpuid4_info *this_leaf, *sibling_leaf;
- int sibling;
-
- this_leaf = CPUID4_INFO_IDX(cpu, index);
- for_each_cpu(sibling, to_cpumask(this_leaf->shared_cpu_map)) {
- sibling_leaf = CPUID4_INFO_IDX(sibling, index);
- cpumask_clear_cpu(cpu,
- to_cpumask(sibling_leaf->shared_cpu_map));
- }
-}
-#else
-static void cache_shared_cpu_map_setup(unsigned int cpu, int index)
-{
-}
-
-static void cache_remove_shared_cpu_map(unsigned int cpu, int index)
-{
-}
-#endif
-
-static void free_cache_attributes(unsigned int cpu)
-{
- int i;
-
- for (i = 0; i < num_cache_leaves; i++)
- cache_remove_shared_cpu_map(cpu, i);
-
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
-}
-
-static void get_cpu_leaves(void *_retval)
-{
- int j, *retval = _retval, cpu = smp_processor_id();
+ return;
- /* Do cpuid and store the results */
- for (j = 0; j < num_cache_leaves; j++) {
- struct _cpuid4_info *this_leaf = CPUID4_INFO_IDX(cpu, j);
+ index_msb = get_count_order(num_threads_sharing);
- *retval = cpuid4_cache_lookup_regs(j, &this_leaf->base);
- if (unlikely(*retval < 0)) {
- int i;
+ for_each_online_cpu(i)
+ if (cpu_data(i).apicid >> index_msb == c->apicid >> index_msb) {
+ struct cpu_cacheinfo *sib_cpu_ci = get_cpu_cacheinfo(i);
- for (i = 0; i < j; i++)
- cache_remove_shared_cpu_map(cpu, i);
- break;
+ if (i == cpu || !sib_cpu_ci->info_list)
+ continue;/* skip if itself or no cacheinfo */
+ sibling_leaf = sib_cpu_ci->info_list + index;
+ cpumask_set_cpu(i, &this_leaf->shared_cpu_map);
+ cpumask_set_cpu(cpu, &sibling_leaf->shared_cpu_map);
}
- cache_shared_cpu_map_setup(cpu, j);
- }
}
-static int detect_cache_attributes(unsigned int cpu)
+static void ci_leaf_init(struct cacheinfo *this_leaf,
+ struct _cpuid4_info_regs *base)
{
- int retval;
-
- if (num_cache_leaves == 0)
- return -ENOENT;
-
- per_cpu(ici_cpuid4_info, cpu) = kzalloc(
- sizeof(struct _cpuid4_info) * num_cache_leaves, GFP_KERNEL);
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return -ENOMEM;
-
- smp_call_function_single(cpu, get_cpu_leaves, &retval, true);
- if (retval) {
- kfree(per_cpu(ici_cpuid4_info, cpu));
- per_cpu(ici_cpuid4_info, cpu) = NULL;
- }
-
- return retval;
+ this_leaf->level = base->eax.split.level;
+ this_leaf->type = cache_type_map[base->eax.split.type];
+ this_leaf->coherency_line_size =
+ base->ebx.split.coherency_line_size + 1;
+ this_leaf->ways_of_associativity =
+ base->ebx.split.ways_of_associativity + 1;
+ this_leaf->size = base->size;
+ this_leaf->number_of_sets = base->ecx.split.number_of_sets + 1;
+ this_leaf->physical_line_partition =
+ base->ebx.split.physical_line_partition + 1;
+ this_leaf->priv = base->nb;
}
-#include <linux/kobject.h>
-#include <linux/sysfs.h>
-#include <linux/cpu.h>
-
-/* pointer to kobject for cpuX/cache */
-static DEFINE_PER_CPU(struct kobject *, ici_cache_kobject);
-
-struct _index_kobject {
- struct kobject kobj;
- unsigned int cpu;
- unsigned short index;
-};
-
-/* pointer to array of kobjects for cpuX/cache/indexY */
-static DEFINE_PER_CPU(struct _index_kobject *, ici_index_kobject);
-#define INDEX_KOBJECT_PTR(x, y) (&((per_cpu(ici_index_kobject, x))[y]))
-
-#define show_one_plus(file_name, object, val) \
-static ssize_t show_##file_name(struct _cpuid4_info *this_leaf, char *buf, \
- unsigned int cpu) \
-{ \
- return sprintf(buf, "%lu\n", (unsigned long)this_leaf->object + val); \
-}
-
-show_one_plus(level, base.eax.split.level, 0);
-show_one_plus(coherency_line_size, base.ebx.split.coherency_line_size, 1);
-show_one_plus(physical_line_partition, base.ebx.split.physical_line_partition, 1);
-show_one_plus(ways_of_associativity, base.ebx.split.ways_of_associativity, 1);
-show_one_plus(number_of_sets, base.ecx.split.number_of_sets, 1);
-
-static ssize_t show_size(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- return sprintf(buf, "%luK\n", this_leaf->base.size / 1024);
-}
-
-static ssize_t show_shared_cpu_map_func(struct _cpuid4_info *this_leaf,
- int type, char *buf)
-{
- const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map);
- int ret;
-
- if (type)
- ret = scnprintf(buf, PAGE_SIZE - 1, "%*pbl",
- cpumask_pr_args(mask));
- else
- ret = scnprintf(buf, PAGE_SIZE - 1, "%*pb",
- cpumask_pr_args(mask));
- buf[ret++] = '\n';
- buf[ret] = '\0';
- return ret;
-}
-
-static inline ssize_t show_shared_cpu_map(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
+static int __init_cache_level(unsigned int cpu)
{
- return show_shared_cpu_map_func(leaf, 0, buf);
-}
-
-static inline ssize_t show_shared_cpu_list(struct _cpuid4_info *leaf, char *buf,
- unsigned int cpu)
-{
- return show_shared_cpu_map_func(leaf, 1, buf);
-}
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
-static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf,
- unsigned int cpu)
-{
- switch (this_leaf->base.eax.split.type) {
- case CACHE_TYPE_DATA:
- return sprintf(buf, "Data\n");
- case CACHE_TYPE_INST:
- return sprintf(buf, "Instruction\n");
- case CACHE_TYPE_UNIFIED:
- return sprintf(buf, "Unified\n");
- default:
- return sprintf(buf, "Unknown\n");
- }
-}
-
-#define to_object(k) container_of(k, struct _index_kobject, kobj)
-#define to_attr(a) container_of(a, struct _cache_attr, attr)
-
-#define define_one_ro(_name) \
-static struct _cache_attr _name = \
- __ATTR(_name, 0444, show_##_name, NULL)
-
-define_one_ro(level);
-define_one_ro(type);
-define_one_ro(coherency_line_size);
-define_one_ro(physical_line_partition);
-define_one_ro(ways_of_associativity);
-define_one_ro(number_of_sets);
-define_one_ro(size);
-define_one_ro(shared_cpu_map);
-define_one_ro(shared_cpu_list);
-
-static struct attribute *default_attrs[] = {
- &type.attr,
- &level.attr,
- &coherency_line_size.attr,
- &physical_line_partition.attr,
- &ways_of_associativity.attr,
- &number_of_sets.attr,
- &size.attr,
- &shared_cpu_map.attr,
- &shared_cpu_list.attr,
- NULL
-};
-
-#ifdef CONFIG_AMD_NB
-static struct attribute **amd_l3_attrs(void)
-{
- static struct attribute **attrs;
- int n;
-
- if (attrs)
- return attrs;
-
- n = ARRAY_SIZE(default_attrs);
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE))
- n += 2;
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- n += 1;
-
- attrs = kzalloc(n * sizeof (struct attribute *), GFP_KERNEL);
- if (attrs == NULL)
- return attrs = default_attrs;
-
- for (n = 0; default_attrs[n]; n++)
- attrs[n] = default_attrs[n];
-
- if (amd_nb_has_feature(AMD_NB_L3_INDEX_DISABLE)) {
- attrs[n++] = &cache_disable_0.attr;
- attrs[n++] = &cache_disable_1.attr;
- }
-
- if (amd_nb_has_feature(AMD_NB_L3_PARTITIONING))
- attrs[n++] = &subcaches.attr;
-
- return attrs;
-}
-#endif
-
-static ssize_t show(struct kobject *kobj, struct attribute *attr, char *buf)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->show ?
- fattr->show(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static ssize_t store(struct kobject *kobj, struct attribute *attr,
- const char *buf, size_t count)
-{
- struct _cache_attr *fattr = to_attr(attr);
- struct _index_kobject *this_leaf = to_object(kobj);
- ssize_t ret;
-
- ret = fattr->store ?
- fattr->store(CPUID4_INFO_IDX(this_leaf->cpu, this_leaf->index),
- buf, count, this_leaf->cpu) :
- 0;
- return ret;
-}
-
-static const struct sysfs_ops sysfs_ops = {
- .show = show,
- .store = store,
-};
-
-static struct kobj_type ktype_cache = {
- .sysfs_ops = &sysfs_ops,
- .default_attrs = default_attrs,
-};
-
-static struct kobj_type ktype_percpu_entry = {
- .sysfs_ops = &sysfs_ops,
-};
-
-static void cpuid4_cache_sysfs_exit(unsigned int cpu)
-{
- kfree(per_cpu(ici_cache_kobject, cpu));
- kfree(per_cpu(ici_index_kobject, cpu));
- per_cpu(ici_cache_kobject, cpu) = NULL;
- per_cpu(ici_index_kobject, cpu) = NULL;
- free_cache_attributes(cpu);
-}
-
-static int cpuid4_cache_sysfs_init(unsigned int cpu)
-{
- int err;
-
- if (num_cache_leaves == 0)
+ if (!num_cache_leaves)
return -ENOENT;
-
- err = detect_cache_attributes(cpu);
- if (err)
- return err;
-
- /* Allocate all required memory */
- per_cpu(ici_cache_kobject, cpu) =
- kzalloc(sizeof(struct kobject), GFP_KERNEL);
- if (unlikely(per_cpu(ici_cache_kobject, cpu) == NULL))
- goto err_out;
-
- per_cpu(ici_index_kobject, cpu) = kzalloc(
- sizeof(struct _index_kobject) * num_cache_leaves, GFP_KERNEL);
- if (unlikely(per_cpu(ici_index_kobject, cpu) == NULL))
- goto err_out;
-
+ if (!this_cpu_ci)
+ return -EINVAL;
+ this_cpu_ci->num_levels = 3;
+ this_cpu_ci->num_leaves = num_cache_leaves;
return 0;
-
-err_out:
- cpuid4_cache_sysfs_exit(cpu);
- return -ENOMEM;
}
-static DECLARE_BITMAP(cache_dev_map, NR_CPUS);
-
-/* Add/Remove cache interface for CPU device */
-static int cache_add_dev(struct device *dev)
+static int __populate_cache_leaves(unsigned int cpu)
{
- unsigned int cpu = dev->id;
- unsigned long i, j;
- struct _index_kobject *this_object;
- struct _cpuid4_info *this_leaf;
- int retval;
-
- retval = cpuid4_cache_sysfs_init(cpu);
- if (unlikely(retval < 0))
- return retval;
-
- retval = kobject_init_and_add(per_cpu(ici_cache_kobject, cpu),
- &ktype_percpu_entry,
- &dev->kobj, "%s", "cache");
- if (retval < 0) {
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
+ unsigned int idx, ret;
+ struct cpu_cacheinfo *this_cpu_ci = get_cpu_cacheinfo(cpu);
+ struct cacheinfo *this_leaf = this_cpu_ci->info_list;
+ struct _cpuid4_info_regs id4_regs = {};
- for (i = 0; i < num_cache_leaves; i++) {
- this_object = INDEX_KOBJECT_PTR(cpu, i);
- this_object->cpu = cpu;
- this_object->index = i;
-
- this_leaf = CPUID4_INFO_IDX(cpu, i);
-
- ktype_cache.default_attrs = default_attrs;
-#ifdef CONFIG_AMD_NB
- if (this_leaf->base.nb)
- ktype_cache.default_attrs = amd_l3_attrs();
-#endif
- retval = kobject_init_and_add(&(this_object->kobj),
- &ktype_cache,
- per_cpu(ici_cache_kobject, cpu),
- "index%1lu", i);
- if (unlikely(retval)) {
- for (j = 0; j < i; j++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, j)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
- return retval;
- }
- kobject_uevent(&(this_object->kobj), KOBJ_ADD);
+ for (idx = 0; idx < this_cpu_ci->num_leaves; idx++) {
+ ret = cpuid4_cache_lookup_regs(idx, &id4_regs);
+ if (ret)
+ return ret;
+ ci_leaf_init(this_leaf++, &id4_regs);
+ __cache_cpumap_setup(cpu, idx, &id4_regs);
}
- cpumask_set_cpu(cpu, to_cpumask(cache_dev_map));
-
- kobject_uevent(per_cpu(ici_cache_kobject, cpu), KOBJ_ADD);
return 0;
}
-static void cache_remove_dev(struct device *dev)
-{
- unsigned int cpu = dev->id;
- unsigned long i;
-
- if (per_cpu(ici_cpuid4_info, cpu) == NULL)
- return;
- if (!cpumask_test_cpu(cpu, to_cpumask(cache_dev_map)))
- return;
- cpumask_clear_cpu(cpu, to_cpumask(cache_dev_map));
-
- for (i = 0; i < num_cache_leaves; i++)
- kobject_put(&(INDEX_KOBJECT_PTR(cpu, i)->kobj));
- kobject_put(per_cpu(ici_cache_kobject, cpu));
- cpuid4_cache_sysfs_exit(cpu);
-}
-
-static int cacheinfo_cpu_callback(struct notifier_block *nfb,
- unsigned long action, void *hcpu)
-{
- unsigned int cpu = (unsigned long)hcpu;
- struct device *dev;
-
- dev = get_cpu_device(cpu);
- switch (action) {
- case CPU_ONLINE:
- case CPU_ONLINE_FROZEN:
- cache_add_dev(dev);
- break;
- case CPU_DEAD:
- case CPU_DEAD_FROZEN:
- cache_remove_dev(dev);
- break;
- }
- return NOTIFY_OK;
-}
-
-static struct notifier_block cacheinfo_cpu_notifier = {
- .notifier_call = cacheinfo_cpu_callback,
-};
-
-static int __init cache_sysfs_init(void)
-{
- int i, err = 0;
-
- if (num_cache_leaves == 0)
- return 0;
-
- cpu_notifier_register_begin();
- for_each_online_cpu(i) {
- struct device *dev = get_cpu_device(i);
-
- err = cache_add_dev(dev);
- if (err)
- goto out;
- }
- __register_hotcpu_notifier(&cacheinfo_cpu_notifier);
-
-out:
- cpu_notifier_register_done();
- return err;
-}
-
-device_initcall(cache_sysfs_init);
-
-#endif
+DEFINE_SMP_CALL_CACHE_FUNCTION(init_cache_level)
+DEFINE_SMP_CALL_CACHE_FUNCTION(populate_cache_leaves)
diff --git a/arch/x86/kernel/cpu/mcheck/mce-internal.h b/arch/x86/kernel/cpu/mcheck/mce-internal.h
index 10b4690..fe32074 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-internal.h
+++ b/arch/x86/kernel/cpu/mcheck/mce-internal.h
@@ -14,6 +14,7 @@ enum severity_level {
};
#define ATTR_LEN 16
+#define INITIAL_CHECK_INTERVAL 5 * 60 /* 5 minutes */
/* One object for each MCE bank, shared by all CPUs */
struct mce_bank {
@@ -23,20 +24,20 @@ struct mce_bank {
char attrname[ATTR_LEN]; /* attribute name */
};
-int mce_severity(struct mce *a, int tolerant, char **msg, bool is_excp);
+extern int (*mce_severity)(struct mce *a, int tolerant, char **msg, bool is_excp);
struct dentry *mce_get_debugfs_dir(void);
extern struct mce_bank *mce_banks;
extern mce_banks_t mce_banks_ce_disabled;
#ifdef CONFIG_X86_MCE_INTEL
-unsigned long mce_intel_adjust_timer(unsigned long interval);
-void mce_intel_cmci_poll(void);
+unsigned long cmci_intel_adjust_timer(unsigned long interval);
+bool mce_intel_cmci_poll(void);
void mce_intel_hcpu_update(unsigned long cpu);
void cmci_disable_bank(int bank);
#else
-# define mce_intel_adjust_timer mce_adjust_timer_default
-static inline void mce_intel_cmci_poll(void) { }
+# define cmci_intel_adjust_timer mce_adjust_timer_default
+static inline bool mce_intel_cmci_poll(void) { return false; }
static inline void mce_intel_hcpu_update(unsigned long cpu) { }
static inline void cmci_disable_bank(int bank) { }
#endif
diff --git a/arch/x86/kernel/cpu/mcheck/mce-severity.c b/arch/x86/kernel/cpu/mcheck/mce-severity.c
index 8bb4330..9c682c2 100644
--- a/arch/x86/kernel/cpu/mcheck/mce-severity.c
+++ b/arch/x86/kernel/cpu/mcheck/mce-severity.c
@@ -186,7 +186,61 @@ static int error_context(struct mce *m)
return ((m->cs & 3) == 3) ? IN_USER : IN_KERNEL;
}
-int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
+/*
+ * See AMD Error Scope Hierarchy table in a newer BKDG. For example
+ * 49125_15h_Models_30h-3Fh_BKDG.pdf, section "RAS Features"
+ */
+static int mce_severity_amd(struct mce *m, int tolerant, char **msg, bool is_excp)
+{
+ enum context ctx = error_context(m);
+
+ /* Processor Context Corrupt, no need to fumble too much, die! */
+ if (m->status & MCI_STATUS_PCC)
+ return MCE_PANIC_SEVERITY;
+
+ if (m->status & MCI_STATUS_UC) {
+
+ /*
+ * On older systems where overflow_recov flag is not present, we
+ * should simply panic if an error overflow occurs. If
+ * overflow_recov flag is present and set, then software can try
+ * to at least kill process to prolong system operation.
+ */
+ if (mce_flags.overflow_recov) {
+ /* software can try to contain */
+ if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL))
+ return MCE_PANIC_SEVERITY;
+
+ /* kill current process */
+ return MCE_AR_SEVERITY;
+ } else {
+ /* at least one error was not logged */
+ if (m->status & MCI_STATUS_OVER)
+ return MCE_PANIC_SEVERITY;
+ }
+
+ /*
+ * For any other case, return MCE_UC_SEVERITY so that we log the
+ * error and exit #MC handler.
+ */
+ return MCE_UC_SEVERITY;
+ }
+
+ /*
+ * deferred error: poll handler catches these and adds to mce_ring so
+ * memory-failure can take recovery actions.
+ */
+ if (m->status & MCI_STATUS_DEFERRED)
+ return MCE_DEFERRED_SEVERITY;
+
+ /*
+ * corrected error: poll handler catches these and passes responsibility
+ * of decoding the error to EDAC
+ */
+ return MCE_KEEP_SEVERITY;
+}
+
+static int mce_severity_intel(struct mce *m, int tolerant, char **msg, bool is_excp)
{
enum exception excp = (is_excp ? EXCP_CONTEXT : NO_EXCP);
enum context ctx = error_context(m);
@@ -216,6 +270,16 @@ int mce_severity(struct mce *m, int tolerant, char **msg, bool is_excp)
}
}
+/* Default to mce_severity_intel */
+int (*mce_severity)(struct mce *m, int tolerant, char **msg, bool is_excp) =
+ mce_severity_intel;
+
+void __init mcheck_vendor_init_severity(void)
+{
+ if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD)
+ mce_severity = mce_severity_amd;
+}
+
#ifdef CONFIG_DEBUG_FS
static void *s_start(struct seq_file *f, loff_t *pos)
{
diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c
index 3c036cb..e535533 100644
--- a/arch/x86/kernel/cpu/mcheck/mce.c
+++ b/arch/x86/kernel/cpu/mcheck/mce.c
@@ -60,11 +60,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex);
#define CREATE_TRACE_POINTS
#include <trace/events/mce.h>
-#define SPINUNIT 100 /* 100ns */
+#define SPINUNIT 100 /* 100ns */
DEFINE_PER_CPU(unsigned, mce_exception_count);
struct mce_bank *mce_banks __read_mostly;
+struct mce_vendor_flags mce_flags __read_mostly;
struct mca_config mca_cfg __read_mostly = {
.bootlog = -1,
@@ -89,9 +90,6 @@ static DECLARE_WAIT_QUEUE_HEAD(mce_chrdev_wait);
static DEFINE_PER_CPU(struct mce, mces_seen);
static int cpu_missing;
-/* CMCI storm detection filter */
-static DEFINE_PER_CPU(unsigned long, mce_polled_error);
-
/*
* MCA banks polled by the period polling timer for corrected events.
* With Intel CMCI, this only has MCA banks which do not support CMCI (if any).
@@ -622,8 +620,9 @@ DEFINE_PER_CPU(unsigned, mce_poll_count);
* is already totally * confused. In this case it's likely it will
* not fully execute the machine check handler either.
*/
-void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
+bool machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
{
+ bool error_logged = false;
struct mce m;
int severity;
int i;
@@ -646,7 +645,7 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
if (!(m.status & MCI_STATUS_VAL))
continue;
- this_cpu_write(mce_polled_error, 1);
+
/*
* Uncorrected or signalled events are handled by the exception
* handler when it is enabled, so don't process those here.
@@ -679,8 +678,10 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
* Don't get the IP here because it's unlikely to
* have anything to do with the actual error location.
*/
- if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce)
+ if (!(flags & MCP_DONTLOG) && !mca_cfg.dont_log_ce) {
+ error_logged = true;
mce_log(&m);
+ }
/*
* Clear state for this bank.
@@ -694,6 +695,8 @@ void machine_check_poll(enum mcp_flags flags, mce_banks_t *b)
*/
sync_core();
+
+ return error_logged;
}
EXPORT_SYMBOL_GPL(machine_check_poll);
@@ -813,7 +816,7 @@ static void mce_reign(void)
* other CPUs.
*/
if (m && global_worst >= MCE_PANIC_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Fatal Machine check", m, msg);
+ mce_panic("Fatal machine check", m, msg);
/*
* For UC somewhere we let the CPU who detects it handle it.
@@ -826,7 +829,7 @@ static void mce_reign(void)
* source or one CPU is hung. Panic.
*/
if (global_worst <= MCE_KEEP_SEVERITY && mca_cfg.tolerant < 3)
- mce_panic("Machine check from unknown source", NULL, NULL);
+ mce_panic("Fatal machine check from unknown source", NULL, NULL);
/*
* Now clear all the mces_seen so that they don't reappear on
@@ -1258,7 +1261,7 @@ void mce_log_therm_throt_event(__u64 status)
* poller finds an MCE, poll 2x faster. When the poller finds no more
* errors, poll 2x slower (up to check_interval seconds).
*/
-static unsigned long check_interval = 5 * 60; /* 5 minutes */
+static unsigned long check_interval = INITIAL_CHECK_INTERVAL;
static DEFINE_PER_CPU(unsigned long, mce_next_interval); /* in jiffies */
static DEFINE_PER_CPU(struct timer_list, mce_timer);
@@ -1268,49 +1271,57 @@ static unsigned long mce_adjust_timer_default(unsigned long interval)
return interval;
}
-static unsigned long (*mce_adjust_timer)(unsigned long interval) =
- mce_adjust_timer_default;
+static unsigned long (*mce_adjust_timer)(unsigned long interval) = mce_adjust_timer_default;
-static int cmc_error_seen(void)
+static void __restart_timer(struct timer_list *t, unsigned long interval)
{
- unsigned long *v = this_cpu_ptr(&mce_polled_error);
+ unsigned long when = jiffies + interval;
+ unsigned long flags;
+
+ local_irq_save(flags);
- return test_and_clear_bit(0, v);
+ if (timer_pending(t)) {
+ if (time_before(when, t->expires))
+ mod_timer_pinned(t, when);
+ } else {
+ t->expires = round_jiffies(when);
+ add_timer_on(t, smp_processor_id());
+ }
+
+ local_irq_restore(flags);
}
static void mce_timer_fn(unsigned long data)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
+ int cpu = smp_processor_id();
unsigned long iv;
- int notify;
- WARN_ON(smp_processor_id() != data);
+ WARN_ON(cpu != data);
+
+ iv = __this_cpu_read(mce_next_interval);
if (mce_available(this_cpu_ptr(&cpu_info))) {
- machine_check_poll(MCP_TIMESTAMP,
- this_cpu_ptr(&mce_poll_banks));
- mce_intel_cmci_poll();
+ machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_poll_banks));
+
+ if (mce_intel_cmci_poll()) {
+ iv = mce_adjust_timer(iv);
+ goto done;
+ }
}
/*
- * Alert userspace if needed. If we logged an MCE, reduce the
- * polling interval, otherwise increase the polling interval.
+ * Alert userspace if needed. If we logged an MCE, reduce the polling
+ * interval, otherwise increase the polling interval.
*/
- iv = __this_cpu_read(mce_next_interval);
- notify = mce_notify_irq();
- notify |= cmc_error_seen();
- if (notify) {
+ if (mce_notify_irq())
iv = max(iv / 2, (unsigned long) HZ/100);
- } else {
+ else
iv = min(iv * 2, round_jiffies_relative(check_interval * HZ));
- iv = mce_adjust_timer(iv);
- }
+
+done:
__this_cpu_write(mce_next_interval, iv);
- /* Might have become 0 after CMCI storm subsided */
- if (iv) {
- t->expires = jiffies + iv;
- add_timer_on(t, smp_processor_id());
- }
+ __restart_timer(t, iv);
}
/*
@@ -1319,16 +1330,10 @@ static void mce_timer_fn(unsigned long data)
void mce_timer_kick(unsigned long interval)
{
struct timer_list *t = this_cpu_ptr(&mce_timer);
- unsigned long when = jiffies + interval;
unsigned long iv = __this_cpu_read(mce_next_interval);
- if (timer_pending(t)) {
- if (time_before(when, t->expires))
- mod_timer_pinned(t, when);
- } else {
- t->expires = round_jiffies(when);
- add_timer_on(t, smp_processor_id());
- }
+ __restart_timer(t, interval);
+
if (interval < iv)
__this_cpu_write(mce_next_interval, interval);
}
@@ -1525,45 +1530,46 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c)
* Various K7s with broken bank 0 around. Always disable
* by default.
*/
- if (c->x86 == 6 && cfg->banks > 0)
+ if (c->x86 == 6 && cfg->banks > 0)
mce_banks[0].ctl = 0;
- /*
- * Turn off MC4_MISC thresholding banks on those models since
- * they're not supported there.
- */
- if (c->x86 == 0x15 &&
- (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
- int i;
- u64 val, hwcr;
- bool need_toggle;
- u32 msrs[] = {
+ /*
+ * overflow_recov is supported for F15h Models 00h-0fh
+ * even though we don't have a CPUID bit for it.
+ */
+ if (c->x86 == 0x15 && c->x86_model <= 0xf)
+ mce_flags.overflow_recov = 1;
+
+ /*
+ * Turn off MC4_MISC thresholding banks on those models since
+ * they're not supported there.
+ */
+ if (c->x86 == 0x15 &&
+ (c->x86_model >= 0x10 && c->x86_model <= 0x1f)) {
+ int i;
+ u64 hwcr;
+ bool need_toggle;
+ u32 msrs[] = {
0x00000413, /* MC4_MISC0 */
0xc0000408, /* MC4_MISC1 */
- };
+ };
- rdmsrl(MSR_K7_HWCR, hwcr);
+ rdmsrl(MSR_K7_HWCR, hwcr);
- /* McStatusWrEn has to be set */
- need_toggle = !(hwcr & BIT(18));
+ /* McStatusWrEn has to be set */
+ need_toggle = !(hwcr & BIT(18));
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr | BIT(18));
- for (i = 0; i < ARRAY_SIZE(msrs); i++) {
- rdmsrl(msrs[i], val);
+ /* Clear CntP bit safely */
+ for (i = 0; i < ARRAY_SIZE(msrs); i++)
+ msr_clear_bit(msrs[i], 62);
- /* CntP bit set? */
- if (val & BIT_64(62)) {
- val &= ~BIT_64(62);
- wrmsrl(msrs[i], val);
- }
- }
-
- /* restore old settings */
- if (need_toggle)
- wrmsrl(MSR_K7_HWCR, hwcr);
- }
+ /* restore old settings */
+ if (need_toggle)
+ wrmsrl(MSR_K7_HWCR, hwcr);
+ }
}
if (c->x86_vendor == X86_VENDOR_INTEL) {
@@ -1629,10 +1635,11 @@ static void __mcheck_cpu_init_vendor(struct cpuinfo_x86 *c)
switch (c->x86_vendor) {
case X86_VENDOR_INTEL:
mce_intel_feature_init(c);
- mce_adjust_timer = mce_intel_adjust_timer;
+ mce_adjust_timer = cmci_intel_adjust_timer;
break;
case X86_VENDOR_AMD:
mce_amd_feature_init(c);
+ mce_flags.overflow_recov = cpuid_ebx(0x80000007) & 0x1;
break;
default:
break;
@@ -2017,6 +2024,7 @@ __setup("mce", mcheck_enable);
int __init mcheck_init(void)
{
mcheck_intel_therm_init();
+ mcheck_vendor_init_severity();
return 0;
}
diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c
index f1c3769..55ad9b3 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_amd.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c
@@ -79,7 +79,7 @@ static inline bool is_shared_bank(int bank)
return (bank == 4);
}
-static const char * const bank4_names(struct threshold_block *b)
+static const char *bank4_names(const struct threshold_block *b)
{
switch (b->address) {
/* MSR4_MISC0 */
@@ -250,6 +250,7 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c)
if (!b.interrupt_capable)
goto init;
+ b.interrupt_enable = 1;
new = (high & MASK_LVTOFF_HI) >> 20;
offset = setup_APIC_mce(offset, new);
@@ -322,6 +323,8 @@ static void amd_threshold_interrupt(void)
log:
mce_setup(&m);
rdmsrl(MSR_IA32_MCx_STATUS(bank), m.status);
+ if (!(m.status & MCI_STATUS_VAL))
+ return;
m.misc = ((u64)high << 32) | low;
m.bank = bank;
mce_log(&m);
@@ -497,10 +500,12 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank,
b->interrupt_capable = lvt_interrupt_supported(bank, high);
b->threshold_limit = THRESHOLD_MAX;
- if (b->interrupt_capable)
+ if (b->interrupt_capable) {
threshold_ktype.default_attrs[2] = &interrupt_enable.attr;
- else
+ b->interrupt_enable = 1;
+ } else {
threshold_ktype.default_attrs[2] = NULL;
+ }
INIT_LIST_HEAD(&b->miscj);
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index b3c97ba..b4a41cf 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -39,6 +39,15 @@
static DEFINE_PER_CPU(mce_banks_t, mce_banks_owned);
/*
+ * CMCI storm detection backoff counter
+ *
+ * During storm, we reset this counter to INITIAL_CHECK_INTERVAL in case we've
+ * encountered an error. If not, we decrement it by one. We signal the end of
+ * the CMCI storm when it reaches 0.
+ */
+static DEFINE_PER_CPU(int, cmci_backoff_cnt);
+
+/*
* cmci_discover_lock protects against parallel discovery attempts
* which could race against each other.
*/
@@ -46,7 +55,7 @@ static DEFINE_RAW_SPINLOCK(cmci_discover_lock);
#define CMCI_THRESHOLD 1
#define CMCI_POLL_INTERVAL (30 * HZ)
-#define CMCI_STORM_INTERVAL (1 * HZ)
+#define CMCI_STORM_INTERVAL (HZ)
#define CMCI_STORM_THRESHOLD 15
static DEFINE_PER_CPU(unsigned long, cmci_time_stamp);
@@ -82,11 +91,21 @@ static int cmci_supported(int *banks)
return !!(cap & MCG_CMCI_P);
}
-void mce_intel_cmci_poll(void)
+bool mce_intel_cmci_poll(void)
{
if (__this_cpu_read(cmci_storm_state) == CMCI_STORM_NONE)
- return;
- machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
+ return false;
+
+ /*
+ * Reset the counter if we've logged an error in the last poll
+ * during the storm.
+ */
+ if (machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned)))
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
+ else
+ this_cpu_dec(cmci_backoff_cnt);
+
+ return true;
}
void mce_intel_hcpu_update(unsigned long cpu)
@@ -97,31 +116,32 @@ void mce_intel_hcpu_update(unsigned long cpu)
per_cpu(cmci_storm_state, cpu) = CMCI_STORM_NONE;
}
-unsigned long mce_intel_adjust_timer(unsigned long interval)
+unsigned long cmci_intel_adjust_timer(unsigned long interval)
{
- int r;
-
- if (interval < CMCI_POLL_INTERVAL)
- return interval;
+ if ((this_cpu_read(cmci_backoff_cnt) > 0) &&
+ (__this_cpu_read(cmci_storm_state) == CMCI_STORM_ACTIVE)) {
+ mce_notify_irq();
+ return CMCI_STORM_INTERVAL;
+ }
switch (__this_cpu_read(cmci_storm_state)) {
case CMCI_STORM_ACTIVE:
+
/*
* We switch back to interrupt mode once the poll timer has
- * silenced itself. That means no events recorded and the
- * timer interval is back to our poll interval.
+ * silenced itself. That means no events recorded and the timer
+ * interval is back to our poll interval.
*/
__this_cpu_write(cmci_storm_state, CMCI_STORM_SUBSIDED);
- r = atomic_sub_return(1, &cmci_storm_on_cpus);
- if (r == 0)
+ if (!atomic_sub_return(1, &cmci_storm_on_cpus))
pr_notice("CMCI storm subsided: switching to interrupt mode\n");
+
/* FALLTHROUGH */
case CMCI_STORM_SUBSIDED:
/*
- * We wait for all cpus to go back to SUBSIDED
- * state. When that happens we switch back to
- * interrupt mode.
+ * We wait for all CPUs to go back to SUBSIDED state. When that
+ * happens we switch back to interrupt mode.
*/
if (!atomic_read(&cmci_storm_on_cpus)) {
__this_cpu_write(cmci_storm_state, CMCI_STORM_NONE);
@@ -130,10 +150,8 @@ unsigned long mce_intel_adjust_timer(unsigned long interval)
}
return CMCI_POLL_INTERVAL;
default:
- /*
- * We have shiny weather. Let the poll do whatever it
- * thinks.
- */
+
+ /* We have shiny weather. Let the poll do whatever it thinks. */
return interval;
}
}
@@ -178,7 +196,8 @@ static bool cmci_storm_detect(void)
cmci_storm_disable_banks();
__this_cpu_write(cmci_storm_state, CMCI_STORM_ACTIVE);
r = atomic_add_return(1, &cmci_storm_on_cpus);
- mce_timer_kick(CMCI_POLL_INTERVAL);
+ mce_timer_kick(CMCI_STORM_INTERVAL);
+ this_cpu_write(cmci_backoff_cnt, INITIAL_CHECK_INTERVAL);
if (r == 1)
pr_notice("CMCI storm detected: switching to poll mode\n");
@@ -195,6 +214,7 @@ static void intel_threshold_interrupt(void)
{
if (cmci_storm_detect())
return;
+
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
mce_notify_irq();
}
@@ -286,6 +306,7 @@ void cmci_recheck(void)
if (!mce_available(raw_cpu_ptr(&cpu_info)) || !cmci_supported(&banks))
return;
+
local_irq_save(flags);
machine_check_poll(MCP_TIMESTAMP, this_cpu_ptr(&mce_banks_owned));
local_irq_restore(flags);
diff --git a/arch/x86/kernel/cpu/microcode/amd.c b/arch/x86/kernel/cpu/microcode/amd.c
index bfbbe61..12829c3 100644
--- a/arch/x86/kernel/cpu/microcode/amd.c
+++ b/arch/x86/kernel/cpu/microcode/amd.c
@@ -21,7 +21,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/firmware.h>
-#include <linux/pci_ids.h>
#include <linux/uaccess.h>
#include <linux/vmalloc.h>
#include <linux/kernel.h>
diff --git a/arch/x86/kernel/cpu/microcode/core_early.c b/arch/x86/kernel/cpu/microcode/core_early.c
index d45df4bd..a413a69 100644
--- a/arch/x86/kernel/cpu/microcode/core_early.c
+++ b/arch/x86/kernel/cpu/microcode/core_early.c
@@ -23,57 +23,6 @@
#include <asm/processor.h>
#include <asm/cmdline.h>
-#define QCHAR(a, b, c, d) ((a) + ((b) << 8) + ((c) << 16) + ((d) << 24))
-#define CPUID_INTEL1 QCHAR('G', 'e', 'n', 'u')
-#define CPUID_INTEL2 QCHAR('i', 'n', 'e', 'I')
-#define CPUID_INTEL3 QCHAR('n', 't', 'e', 'l')
-#define CPUID_AMD1 QCHAR('A', 'u', 't', 'h')
-#define CPUID_AMD2 QCHAR('e', 'n', 't', 'i')
-#define CPUID_AMD3 QCHAR('c', 'A', 'M', 'D')
-
-#define CPUID_IS(a, b, c, ebx, ecx, edx) \
- (!((ebx ^ (a))|(edx ^ (b))|(ecx ^ (c))))
-
-/*
- * In early loading microcode phase on BSP, boot_cpu_data is not set up yet.
- * x86_vendor() gets vendor id for BSP.
- *
- * In 32 bit AP case, accessing boot_cpu_data needs linear address. To simplify
- * coding, we still use x86_vendor() to get vendor id for AP.
- *
- * x86_vendor() gets vendor information directly through cpuid.
- */
-static int x86_vendor(void)
-{
- u32 eax = 0x00000000;
- u32 ebx, ecx = 0, edx;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- if (CPUID_IS(CPUID_INTEL1, CPUID_INTEL2, CPUID_INTEL3, ebx, ecx, edx))
- return X86_VENDOR_INTEL;
-
- if (CPUID_IS(CPUID_AMD1, CPUID_AMD2, CPUID_AMD3, ebx, ecx, edx))
- return X86_VENDOR_AMD;
-
- return X86_VENDOR_UNKNOWN;
-}
-
-static int x86_family(void)
-{
- u32 eax = 0x00000001;
- u32 ebx, ecx = 0, edx;
- int x86;
-
- native_cpuid(&eax, &ebx, &ecx, &edx);
-
- x86 = (eax >> 8) & 0xf;
- if (x86 == 15)
- x86 += (eax >> 20) & 0xff;
-
- return x86;
-}
-
static bool __init check_loader_disabled_bsp(void)
{
#ifdef CONFIG_X86_32
@@ -96,7 +45,7 @@ static bool __init check_loader_disabled_bsp(void)
void __init load_ucode_bsp(void)
{
- int vendor, x86;
+ int vendor, family;
if (check_loader_disabled_bsp())
return;
@@ -105,15 +54,15 @@ void __init load_ucode_bsp(void)
return;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
load_ucode_intel_bsp();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
load_ucode_amd_bsp();
break;
default:
@@ -132,7 +81,7 @@ static bool check_loader_disabled_ap(void)
void load_ucode_ap(void)
{
- int vendor, x86;
+ int vendor, family;
if (check_loader_disabled_ap())
return;
@@ -141,15 +90,15 @@ void load_ucode_ap(void)
return;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
load_ucode_intel_ap();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
load_ucode_amd_ap();
break;
default:
@@ -179,18 +128,18 @@ int __init save_microcode_in_initrd(void)
void reload_early_microcode(void)
{
- int vendor, x86;
+ int vendor, family;
vendor = x86_vendor();
- x86 = x86_family();
+ family = x86_family();
switch (vendor) {
case X86_VENDOR_INTEL:
- if (x86 >= 6)
+ if (family >= 6)
reload_ucode_intel();
break;
case X86_VENDOR_AMD:
- if (x86 >= 0x10)
+ if (family >= 0x10)
reload_ucode_amd();
break;
default:
diff --git a/arch/x86/kernel/cpu/microcode/intel.c b/arch/x86/kernel/cpu/microcode/intel.c
index 746e7fd..a41bead 100644
--- a/arch/x86/kernel/cpu/microcode/intel.c
+++ b/arch/x86/kernel/cpu/microcode/intel.c
@@ -124,7 +124,7 @@ static int get_matching_mc(struct microcode_intel *mc_intel, int cpu)
cpf = cpu_sig.pf;
crev = cpu_sig.rev;
- return get_matching_microcode(csig, cpf, mc_intel, crev);
+ return get_matching_microcode(csig, cpf, crev, mc_intel);
}
static int apply_microcode_intel(int cpu)
@@ -226,7 +226,7 @@ static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size,
csig = uci->cpu_sig.sig;
cpf = uci->cpu_sig.pf;
- if (get_matching_microcode(csig, cpf, mc, new_rev)) {
+ if (get_matching_microcode(csig, cpf, new_rev, mc)) {
vfree(new_mc);
new_rev = mc_header.rev;
new_mc = mc;
diff --git a/arch/x86/kernel/cpu/microcode/intel_early.c b/arch/x86/kernel/cpu/microcode/intel_early.c
index 420eb93..2f49ab4 100644
--- a/arch/x86/kernel/cpu/microcode/intel_early.c
+++ b/arch/x86/kernel/cpu/microcode/intel_early.c
@@ -16,6 +16,14 @@
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*/
+
+/*
+ * This needs to be before all headers so that pr_debug in printk.h doesn't turn
+ * printk calls into no_printk().
+ *
+ *#define DEBUG
+ */
+
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/slab.h>
@@ -28,6 +36,9 @@
#include <asm/tlbflush.h>
#include <asm/setup.h>
+#undef pr_fmt
+#define pr_fmt(fmt) "microcode: " fmt
+
static unsigned long mc_saved_in_initrd[MAX_UCODE_COUNT];
static struct mc_saved_data {
unsigned int mc_saved_count;
@@ -35,50 +46,45 @@ static struct mc_saved_data {
} mc_saved_data;
static enum ucode_state
-generic_load_microcode_early(struct microcode_intel **mc_saved_p,
- unsigned int mc_saved_count,
- struct ucode_cpu_info *uci)
+load_microcode_early(struct microcode_intel **saved,
+ unsigned int num_saved, struct ucode_cpu_info *uci)
{
struct microcode_intel *ucode_ptr, *new_mc = NULL;
- int new_rev = uci->cpu_sig.rev;
- enum ucode_state state = UCODE_OK;
- unsigned int mc_size;
- struct microcode_header_intel *mc_header;
- unsigned int csig = uci->cpu_sig.sig;
- unsigned int cpf = uci->cpu_sig.pf;
- int i;
+ struct microcode_header_intel *mc_hdr;
+ int new_rev, ret, i;
- for (i = 0; i < mc_saved_count; i++) {
- ucode_ptr = mc_saved_p[i];
+ new_rev = uci->cpu_sig.rev;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
- mc_size = get_totalsize(mc_header);
- if (get_matching_microcode(csig, cpf, ucode_ptr, new_rev)) {
- new_rev = mc_header->rev;
- new_mc = ucode_ptr;
- }
- }
+ for (i = 0; i < num_saved; i++) {
+ ucode_ptr = saved[i];
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
- if (!new_mc) {
- state = UCODE_NFOUND;
- goto out;
+ ret = get_matching_microcode(uci->cpu_sig.sig,
+ uci->cpu_sig.pf,
+ new_rev,
+ ucode_ptr);
+ if (!ret)
+ continue;
+
+ new_rev = mc_hdr->rev;
+ new_mc = ucode_ptr;
}
+ if (!new_mc)
+ return UCODE_NFOUND;
+
uci->mc = (struct microcode_intel *)new_mc;
-out:
- return state;
+ return UCODE_OK;
}
-static void
-microcode_pointer(struct microcode_intel **mc_saved,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start, int mc_saved_count)
+static inline void
+copy_initrd_ptrs(struct microcode_intel **mc_saved, unsigned long *initrd,
+ unsigned long off, int num_saved)
{
int i;
- for (i = 0; i < mc_saved_count; i++)
- mc_saved[i] = (struct microcode_intel *)
- (mc_saved_in_initrd[i] + initrd_start);
+ for (i = 0; i < num_saved; i++)
+ mc_saved[i] = (struct microcode_intel *)(initrd[i] + off);
}
#ifdef CONFIG_X86_32
@@ -102,55 +108,27 @@ microcode_phys(struct microcode_intel **mc_saved_tmp,
#endif
static enum ucode_state
-load_microcode(struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start,
- struct ucode_cpu_info *uci)
+load_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long initrd_start, struct ucode_cpu_info *uci)
{
struct microcode_intel *mc_saved_tmp[MAX_UCODE_COUNT];
unsigned int count = mc_saved_data->mc_saved_count;
if (!mc_saved_data->mc_saved) {
- microcode_pointer(mc_saved_tmp, mc_saved_in_initrd,
- initrd_start, count);
+ copy_initrd_ptrs(mc_saved_tmp, initrd, initrd_start, count);
- return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ return load_microcode_early(mc_saved_tmp, count, uci);
} else {
#ifdef CONFIG_X86_32
microcode_phys(mc_saved_tmp, mc_saved_data);
- return generic_load_microcode_early(mc_saved_tmp, count, uci);
+ return load_microcode_early(mc_saved_tmp, count, uci);
#else
- return generic_load_microcode_early(mc_saved_data->mc_saved,
+ return load_microcode_early(mc_saved_data->mc_saved,
count, uci);
#endif
}
}
-static u8 get_x86_family(unsigned long sig)
-{
- u8 x86;
-
- x86 = (sig >> 8) & 0xf;
-
- if (x86 == 0xf)
- x86 += (sig >> 20) & 0xff;
-
- return x86;
-}
-
-static u8 get_x86_model(unsigned long sig)
-{
- u8 x86, x86_model;
-
- x86 = get_x86_family(sig);
- x86_model = (sig >> 4) & 0xf;
-
- if (x86 == 0x6 || x86 == 0xf)
- x86_model += ((sig >> 16) & 0xf) << 4;
-
- return x86_model;
-}
-
/*
* Given CPU signature and a microcode patch, this function finds if the
* microcode patch has matching family and model with the CPU.
@@ -159,42 +137,40 @@ static enum ucode_state
matching_model_microcode(struct microcode_header_intel *mc_header,
unsigned long sig)
{
- u8 x86, x86_model;
- u8 x86_ucode, x86_model_ucode;
+ unsigned int fam, model;
+ unsigned int fam_ucode, model_ucode;
struct extended_sigtable *ext_header;
unsigned long total_size = get_totalsize(mc_header);
unsigned long data_size = get_datasize(mc_header);
int ext_sigcount, i;
struct extended_signature *ext_sig;
- x86 = get_x86_family(sig);
- x86_model = get_x86_model(sig);
+ fam = __x86_family(sig);
+ model = x86_model(sig);
- x86_ucode = get_x86_family(mc_header->sig);
- x86_model_ucode = get_x86_model(mc_header->sig);
+ fam_ucode = __x86_family(mc_header->sig);
+ model_ucode = x86_model(mc_header->sig);
- if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ if (fam == fam_ucode && model == model_ucode)
return UCODE_OK;
/* Look for ext. headers: */
if (total_size <= data_size + MC_HEADER_SIZE)
return UCODE_NFOUND;
- ext_header = (struct extended_sigtable *)
- mc_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *) mc_header + data_size + MC_HEADER_SIZE;
+ ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
ext_sigcount = ext_header->count;
- ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
for (i = 0; i < ext_sigcount; i++) {
- x86_ucode = get_x86_family(ext_sig->sig);
- x86_model_ucode = get_x86_model(ext_sig->sig);
+ fam_ucode = __x86_family(ext_sig->sig);
+ model_ucode = x86_model(ext_sig->sig);
- if (x86 == x86_ucode && x86_model == x86_model_ucode)
+ if (fam == fam_ucode && model == model_ucode)
return UCODE_OK;
ext_sig++;
}
-
return UCODE_NFOUND;
}
@@ -204,7 +180,7 @@ save_microcode(struct mc_saved_data *mc_saved_data,
unsigned int mc_saved_count)
{
int i, j;
- struct microcode_intel **mc_saved_p;
+ struct microcode_intel **saved_ptr;
int ret;
if (!mc_saved_count)
@@ -213,39 +189,45 @@ save_microcode(struct mc_saved_data *mc_saved_data,
/*
* Copy new microcode data.
*/
- mc_saved_p = kmalloc(mc_saved_count*sizeof(struct microcode_intel *),
- GFP_KERNEL);
- if (!mc_saved_p)
+ saved_ptr = kcalloc(mc_saved_count, sizeof(struct microcode_intel *), GFP_KERNEL);
+ if (!saved_ptr)
return -ENOMEM;
for (i = 0; i < mc_saved_count; i++) {
- struct microcode_intel *mc = mc_saved_src[i];
- struct microcode_header_intel *mc_header = &mc->hdr;
- unsigned long mc_size = get_totalsize(mc_header);
- mc_saved_p[i] = kmalloc(mc_size, GFP_KERNEL);
- if (!mc_saved_p[i]) {
- ret = -ENOMEM;
- goto err;
- }
+ struct microcode_header_intel *mc_hdr;
+ struct microcode_intel *mc;
+ unsigned long size;
+
if (!mc_saved_src[i]) {
ret = -EINVAL;
goto err;
}
- memcpy(mc_saved_p[i], mc, mc_size);
+
+ mc = mc_saved_src[i];
+ mc_hdr = &mc->hdr;
+ size = get_totalsize(mc_hdr);
+
+ saved_ptr[i] = kmalloc(size, GFP_KERNEL);
+ if (!saved_ptr[i]) {
+ ret = -ENOMEM;
+ goto err;
+ }
+
+ memcpy(saved_ptr[i], mc, size);
}
/*
* Point to newly saved microcode.
*/
- mc_saved_data->mc_saved = mc_saved_p;
+ mc_saved_data->mc_saved = saved_ptr;
mc_saved_data->mc_saved_count = mc_saved_count;
return 0;
err:
for (j = 0; j <= i; j++)
- kfree(mc_saved_p[j]);
- kfree(mc_saved_p);
+ kfree(saved_ptr[j]);
+ kfree(saved_ptr);
return ret;
}
@@ -257,48 +239,45 @@ err:
* - or if it is a newly discovered microcode patch.
*
* The microcode patch should have matching model with CPU.
+ *
+ * Returns: The updated number @num_saved of saved microcode patches.
*/
-static void _save_mc(struct microcode_intel **mc_saved, u8 *ucode_ptr,
- unsigned int *mc_saved_count_p)
+static unsigned int _save_mc(struct microcode_intel **mc_saved,
+ u8 *ucode_ptr, unsigned int num_saved)
{
- int i;
- int found = 0;
- unsigned int mc_saved_count = *mc_saved_count_p;
- struct microcode_header_intel *mc_header;
+ struct microcode_header_intel *mc_hdr, *mc_saved_hdr;
+ unsigned int sig, pf, new_rev;
+ int found = 0, i;
+
+ mc_hdr = (struct microcode_header_intel *)ucode_ptr;
+
+ for (i = 0; i < num_saved; i++) {
+ mc_saved_hdr = (struct microcode_header_intel *)mc_saved[i];
+ sig = mc_saved_hdr->sig;
+ pf = mc_saved_hdr->pf;
+ new_rev = mc_hdr->rev;
+
+ if (!get_matching_sig(sig, pf, new_rev, ucode_ptr))
+ continue;
+
+ found = 1;
+
+ if (!revision_is_newer(mc_hdr, new_rev))
+ continue;
- mc_header = (struct microcode_header_intel *)ucode_ptr;
- for (i = 0; i < mc_saved_count; i++) {
- unsigned int sig, pf;
- unsigned int new_rev;
- struct microcode_header_intel *mc_saved_header =
- (struct microcode_header_intel *)mc_saved[i];
- sig = mc_saved_header->sig;
- pf = mc_saved_header->pf;
- new_rev = mc_header->rev;
-
- if (get_matching_sig(sig, pf, ucode_ptr, new_rev)) {
- found = 1;
- if (update_match_revision(mc_header, new_rev)) {
- /*
- * Found an older ucode saved before.
- * Replace the older one with this newer
- * one.
- */
- mc_saved[i] =
- (struct microcode_intel *)ucode_ptr;
- break;
- }
- }
- }
- if (i >= mc_saved_count && !found)
/*
- * This ucode is first time discovered in ucode file.
- * Save it to memory.
+ * Found an older ucode saved earlier. Replace it with
+ * this newer one.
*/
- mc_saved[mc_saved_count++] =
- (struct microcode_intel *)ucode_ptr;
+ mc_saved[i] = (struct microcode_intel *)ucode_ptr;
+ break;
+ }
+
+ /* Newly detected microcode, save it to memory. */
+ if (i >= num_saved && !found)
+ mc_saved[num_saved++] = (struct microcode_intel *)ucode_ptr;
- *mc_saved_count_p = mc_saved_count;
+ return num_saved;
}
/*
@@ -346,7 +325,7 @@ get_matching_model_microcode(int cpu, unsigned long start,
continue;
}
- _save_mc(mc_saved_tmp, ucode_ptr, &mc_saved_count);
+ mc_saved_count = _save_mc(mc_saved_tmp, ucode_ptr, mc_saved_count);
ucode_ptr += mc_size;
}
@@ -372,7 +351,7 @@ out:
static int collect_cpu_info_early(struct ucode_cpu_info *uci)
{
unsigned int val[2];
- u8 x86, x86_model;
+ unsigned int family, model;
struct cpu_signature csig;
unsigned int eax, ebx, ecx, edx;
@@ -387,10 +366,10 @@ static int collect_cpu_info_early(struct ucode_cpu_info *uci)
native_cpuid(&eax, &ebx, &ecx, &edx);
csig.sig = eax;
- x86 = get_x86_family(csig.sig);
- x86_model = get_x86_model(csig.sig);
+ family = __x86_family(csig.sig);
+ model = x86_model(csig.sig);
- if ((x86_model >= 5) || (x86 > 6)) {
+ if ((model >= 5) || (family > 6)) {
/* get processor flags from MSR 0x17 */
native_rdmsr(MSR_IA32_PLATFORM_ID, val[0], val[1]);
csig.pf = 1 << ((val[1] >> 18) & 7);
@@ -429,8 +408,7 @@ static void __ref show_saved_mc(void)
sig = uci.cpu_sig.sig;
pf = uci.cpu_sig.pf;
rev = uci.cpu_sig.rev;
- pr_debug("CPU%d: sig=0x%x, pf=0x%x, rev=0x%x\n",
- smp_processor_id(), sig, pf, rev);
+ pr_debug("CPU: sig=0x%x, pf=0x%x, rev=0x%x\n", sig, pf, rev);
for (i = 0; i < mc_saved_data.mc_saved_count; i++) {
struct microcode_header_intel *mc_saved_header;
@@ -457,8 +435,7 @@ static void __ref show_saved_mc(void)
if (total_size <= data_size + MC_HEADER_SIZE)
continue;
- ext_header = (struct extended_sigtable *)
- mc_saved_header + data_size + MC_HEADER_SIZE;
+ ext_header = (void *) mc_saved_header + data_size + MC_HEADER_SIZE;
ext_sigcount = ext_header->count;
ext_sig = (void *)ext_header + EXT_HEADER_SIZE;
@@ -515,8 +492,7 @@ int save_mc_for_early(u8 *mc)
* Save the microcode patch mc in mc_save_tmp structure if it's a newer
* version.
*/
-
- _save_mc(mc_saved_tmp, mc, &mc_saved_count);
+ mc_saved_count = _save_mc(mc_saved_tmp, mc, mc_saved_count);
/*
* Save the mc_save_tmp in global mc_saved_data.
@@ -548,12 +524,10 @@ EXPORT_SYMBOL_GPL(save_mc_for_early);
static __initdata char ucode_name[] = "kernel/x86/microcode/GenuineIntel.bin";
static __init enum ucode_state
-scan_microcode(unsigned long start, unsigned long end,
- struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- struct ucode_cpu_info *uci)
+scan_microcode(struct mc_saved_data *mc_saved_data, unsigned long *initrd,
+ unsigned long start, unsigned long size,
+ struct ucode_cpu_info *uci)
{
- unsigned int size = end - start + 1;
struct cpio_data cd;
long offset = 0;
#ifdef CONFIG_X86_32
@@ -569,10 +543,8 @@ scan_microcode(unsigned long start, unsigned long end,
if (!cd.data)
return UCODE_ERROR;
-
return get_matching_model_microcode(0, start, cd.data, cd.size,
- mc_saved_data, mc_saved_in_initrd,
- uci);
+ mc_saved_data, initrd, uci);
}
/*
@@ -704,7 +676,7 @@ int __init save_microcode_in_initrd_intel(void)
if (count == 0)
return ret;
- microcode_pointer(mc_saved, mc_saved_in_initrd, initrd_start, count);
+ copy_initrd_ptrs(mc_saved, mc_saved_in_initrd, initrd_start, count);
ret = save_microcode(&mc_saved_data, mc_saved, count);
if (ret)
pr_err("Cannot save microcode patches from initrd.\n");
@@ -716,52 +688,44 @@ int __init save_microcode_in_initrd_intel(void)
static void __init
_load_ucode_intel_bsp(struct mc_saved_data *mc_saved_data,
- unsigned long *mc_saved_in_initrd,
- unsigned long initrd_start_early,
- unsigned long initrd_end_early,
- struct ucode_cpu_info *uci)
+ unsigned long *initrd,
+ unsigned long start, unsigned long size)
{
+ struct ucode_cpu_info uci;
enum ucode_state ret;
- collect_cpu_info_early(uci);
- scan_microcode(initrd_start_early, initrd_end_early, mc_saved_data,
- mc_saved_in_initrd, uci);
+ collect_cpu_info_early(&uci);
- ret = load_microcode(mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, uci);
+ ret = scan_microcode(mc_saved_data, initrd, start, size, &uci);
+ if (ret != UCODE_OK)
+ return;
- if (ret == UCODE_OK)
- apply_microcode_early(uci, true);
+ ret = load_microcode(mc_saved_data, initrd, start, &uci);
+ if (ret != UCODE_OK)
+ return;
+
+ apply_microcode_early(&uci, true);
}
-void __init
-load_ucode_intel_bsp(void)
+void __init load_ucode_intel_bsp(void)
{
- u64 ramdisk_image, ramdisk_size;
- unsigned long initrd_start_early, initrd_end_early;
- struct ucode_cpu_info uci;
+ u64 start, size;
#ifdef CONFIG_X86_32
- struct boot_params *boot_params_p;
+ struct boot_params *p;
- boot_params_p = (struct boot_params *)__pa_nodebug(&boot_params);
- ramdisk_image = boot_params_p->hdr.ramdisk_image;
- ramdisk_size = boot_params_p->hdr.ramdisk_size;
- initrd_start_early = ramdisk_image;
- initrd_end_early = initrd_start_early + ramdisk_size;
+ p = (struct boot_params *)__pa_nodebug(&boot_params);
+ start = p->hdr.ramdisk_image;
+ size = p->hdr.ramdisk_size;
_load_ucode_intel_bsp(
- (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
- (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
- initrd_start_early, initrd_end_early, &uci);
+ (struct mc_saved_data *)__pa_nodebug(&mc_saved_data),
+ (unsigned long *)__pa_nodebug(&mc_saved_in_initrd),
+ start, size);
#else
- ramdisk_image = boot_params.hdr.ramdisk_image;
- ramdisk_size = boot_params.hdr.ramdisk_size;
- initrd_start_early = ramdisk_image + PAGE_OFFSET;
- initrd_end_early = initrd_start_early + ramdisk_size;
-
- _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd,
- initrd_start_early, initrd_end_early,
- &uci);
+ start = boot_params.hdr.ramdisk_image + PAGE_OFFSET;
+ size = boot_params.hdr.ramdisk_size;
+
+ _load_ucode_intel_bsp(&mc_saved_data, mc_saved_in_initrd, start, size);
#endif
}
@@ -771,6 +735,7 @@ void load_ucode_intel_ap(void)
struct ucode_cpu_info uci;
unsigned long *mc_saved_in_initrd_p;
unsigned long initrd_start_addr;
+ enum ucode_state ret;
#ifdef CONFIG_X86_32
unsigned long *initrd_start_p;
@@ -793,8 +758,12 @@ void load_ucode_intel_ap(void)
return;
collect_cpu_info_early(&uci);
- load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
- initrd_start_addr, &uci);
+ ret = load_microcode(mc_saved_data_p, mc_saved_in_initrd_p,
+ initrd_start_addr, &uci);
+
+ if (ret != UCODE_OK)
+ return;
+
apply_microcode_early(&uci, true);
}
@@ -808,8 +777,8 @@ void reload_ucode_intel(void)
collect_cpu_info_early(&uci);
- ret = generic_load_microcode_early(mc_saved_data.mc_saved,
- mc_saved_data.mc_saved_count, &uci);
+ ret = load_microcode_early(mc_saved_data.mc_saved,
+ mc_saved_data.mc_saved_count, &uci);
if (ret != UCODE_OK)
return;
diff --git a/arch/x86/kernel/cpu/microcode/intel_lib.c b/arch/x86/kernel/cpu/microcode/intel_lib.c
index ce69320..cd47a51 100644
--- a/arch/x86/kernel/cpu/microcode/intel_lib.c
+++ b/arch/x86/kernel/cpu/microcode/intel_lib.c
@@ -38,12 +38,6 @@ update_match_cpu(unsigned int csig, unsigned int cpf,
return (!sigmatch(sig, csig, pf, cpf)) ? 0 : 1;
}
-int
-update_match_revision(struct microcode_header_intel *mc_header, int rev)
-{
- return (mc_header->rev <= rev) ? 0 : 1;
-}
-
int microcode_sanity_check(void *mc, int print_err)
{
unsigned long total_size, data_size, ext_table_size;
@@ -128,10 +122,9 @@ int microcode_sanity_check(void *mc, int print_err)
EXPORT_SYMBOL_GPL(microcode_sanity_check);
/*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_sig(unsigned int csig, int cpf, int rev, void *mc)
{
struct microcode_header_intel *mc_header = mc;
struct extended_sigtable *ext_header;
@@ -159,16 +152,15 @@ int get_matching_sig(unsigned int csig, int cpf, void *mc, int rev)
}
/*
- * return 0 - no update found
- * return 1 - found update
+ * Returns 1 if update has been found, 0 otherwise.
*/
-int get_matching_microcode(unsigned int csig, int cpf, void *mc, int rev)
+int get_matching_microcode(unsigned int csig, int cpf, int rev, void *mc)
{
- struct microcode_header_intel *mc_header = mc;
+ struct microcode_header_intel *mc_hdr = mc;
- if (!update_match_revision(mc_header, rev))
+ if (!revision_is_newer(mc_hdr, rev))
return 0;
- return get_matching_sig(csig, cpf, mc, rev);
+ return get_matching_sig(csig, cpf, rev, mc);
}
EXPORT_SYMBOL_GPL(get_matching_microcode);
diff --git a/arch/x86/kernel/cpu/mkcapflags.sh b/arch/x86/kernel/cpu/mkcapflags.sh
index 36d99a3..3f20710 100644
--- a/arch/x86/kernel/cpu/mkcapflags.sh
+++ b/arch/x86/kernel/cpu/mkcapflags.sh
@@ -6,7 +6,7 @@
IN=$1
OUT=$2
-function dump_array()
+dump_array()
{
ARRAY=$1
SIZE=$2
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index 689e357..87848eb 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2236,24 +2236,24 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs)
static unsigned long code_segment_base(struct pt_regs *regs)
{
/*
+ * For IA32 we look at the GDT/LDT segment base to convert the
+ * effective IP to a linear address.
+ */
+
+#ifdef CONFIG_X86_32
+ /*
* If we are in VM86 mode, add the segment offset to convert to a
* linear address.
*/
if (regs->flags & X86_VM_MASK)
return 0x10 * regs->cs;
- /*
- * For IA32 we look at the GDT/LDT segment base to convert the
- * effective IP to a linear address.
- */
-#ifdef CONFIG_X86_32
if (user_mode(regs) && regs->cs != __USER_CS)
return get_segment_base(regs->cs);
#else
- if (test_thread_flag(TIF_IA32)) {
- if (user_mode(regs) && regs->cs != __USER32_CS)
- return get_segment_base(regs->cs);
- }
+ if (user_mode(regs) && !user_64bit_mode(regs) &&
+ regs->cs != __USER32_CS)
+ return get_segment_base(regs->cs);
#endif
return 0;
}
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index aceb2f9..c76d3e3 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -105,7 +105,7 @@ static void kdump_nmi_callback(int cpu, struct pt_regs *regs)
#ifdef CONFIG_X86_32
struct pt_regs fixed_regs;
- if (!user_mode_vm(regs)) {
+ if (!user_mode(regs)) {
crash_fixup_ss_esp(&fixed_regs, regs);
regs = &fixed_regs;
}
diff --git a/arch/x86/kernel/devicetree.c b/arch/x86/kernel/devicetree.c
index 3d35033..6367a78 100644
--- a/arch/x86/kernel/devicetree.c
+++ b/arch/x86/kernel/devicetree.c
@@ -286,13 +286,13 @@ static void __init x86_flattree_get_config(void)
initial_boot_params = dt = early_memremap(initial_dtb, map_len);
size = of_get_flat_dt_size();
if (map_len < size) {
- early_iounmap(dt, map_len);
+ early_memunmap(dt, map_len);
initial_boot_params = dt = early_memremap(initial_dtb, size);
map_len = size;
}
unflatten_and_copy_device_tree();
- early_iounmap(dt, map_len);
+ early_memunmap(dt, map_len);
}
#else
static inline void x86_flattree_get_config(void) { }
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index cf3df1d..9c30acf 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -25,10 +25,12 @@ unsigned int code_bytes = 64;
int kstack_depth_to_print = 3 * STACKSLOTS_PER_LINE;
static int die_counter;
-static void printk_stack_address(unsigned long address, int reliable)
+static void printk_stack_address(unsigned long address, int reliable,
+ void *data)
{
- pr_cont(" [<%p>] %s%pB\n",
- (void *)address, reliable ? "" : "? ", (void *)address);
+ printk("%s [<%p>] %s%pB\n",
+ (char *)data, (void *)address, reliable ? "" : "? ",
+ (void *)address);
}
void printk_address(unsigned long address)
@@ -155,8 +157,7 @@ static int print_trace_stack(void *data, char *name)
static void print_trace_address(void *data, unsigned long addr, int reliable)
{
touch_nmi_watchdog();
- printk(data);
- printk_stack_address(addr, reliable);
+ printk_stack_address(addr, reliable, data);
}
static const struct stacktrace_ops print_trace_ops = {
@@ -278,7 +279,7 @@ int __die(const char *str, struct pt_regs *regs, long err)
print_modules();
show_regs(regs);
#ifdef CONFIG_X86_32
- if (user_mode_vm(regs)) {
+ if (user_mode(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
} else {
@@ -307,7 +308,7 @@ void die(const char *str, struct pt_regs *regs, long err)
unsigned long flags = oops_begin();
int sig = SIGSEGV;
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
report_bug(regs->ip, regs);
if (__die(str, regs, err))
diff --git a/arch/x86/kernel/dumpstack_32.c b/arch/x86/kernel/dumpstack_32.c
index 5abd4cd..464ffd6 100644
--- a/arch/x86/kernel/dumpstack_32.c
+++ b/arch/x86/kernel/dumpstack_32.c
@@ -108,9 +108,12 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
for (i = 0; i < kstack_depth_to_print; i++) {
if (kstack_end(stack))
break;
- if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- pr_cont("\n");
- pr_cont(" %08lx", *stack++);
+ if ((i % STACKSLOTS_PER_LINE) == 0) {
+ if (i != 0)
+ pr_cont("\n");
+ printk("%s %08lx", log_lvl, *stack++);
+ } else
+ pr_cont(" %08lx", *stack++);
touch_nmi_watchdog();
}
pr_cont("\n");
@@ -123,13 +126,13 @@ void show_regs(struct pt_regs *regs)
int i;
show_regs_print_info(KERN_EMERG);
- __show_regs(regs, !user_mode_vm(regs));
+ __show_regs(regs, !user_mode(regs));
/*
* When in-kernel, we also print out the stack and code at the
* time of the fault..
*/
- if (!user_mode_vm(regs)) {
+ if (!user_mode(regs)) {
unsigned int code_prologue = code_bytes * 43 / 64;
unsigned int code_len = code_bytes;
unsigned char c;
diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c
index ff86f19..5f1c626 100644
--- a/arch/x86/kernel/dumpstack_64.c
+++ b/arch/x86/kernel/dumpstack_64.c
@@ -280,12 +280,15 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs,
pr_cont(" <EOI> ");
}
} else {
- if (((long) stack & (THREAD_SIZE-1)) == 0)
+ if (kstack_end(stack))
break;
}
- if (i && ((i % STACKSLOTS_PER_LINE) == 0))
- pr_cont("\n");
- pr_cont(" %016lx", *stack++);
+ if ((i % STACKSLOTS_PER_LINE) == 0) {
+ if (i != 0)
+ pr_cont("\n");
+ printk("%s %016lx", log_lvl, *stack++);
+ } else
+ pr_cont(" %016lx", *stack++);
touch_nmi_watchdog();
}
preempt_enable();
diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c
index 46201de..7d46bb2 100644
--- a/arch/x86/kernel/e820.c
+++ b/arch/x86/kernel/e820.c
@@ -661,7 +661,7 @@ void __init parse_e820_ext(u64 phys_addr, u32 data_len)
extmap = (struct e820entry *)(sdata->data);
__append_e820_map(extmap, entries);
sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map);
- early_iounmap(sdata, data_len);
+ early_memunmap(sdata, data_len);
printk(KERN_INFO "e820: extended physical RAM map:\n");
e820_print_map("extended");
}
diff --git a/arch/x86/kernel/early_printk.c b/arch/x86/kernel/early_printk.c
index a62536a..49ff55e 100644
--- a/arch/x86/kernel/early_printk.c
+++ b/arch/x86/kernel/early_printk.c
@@ -95,20 +95,6 @@ static unsigned long early_serial_base = 0x3f8; /* ttyS0 */
#define DLL 0 /* Divisor Latch Low */
#define DLH 1 /* Divisor latch High */
-static void mem32_serial_out(unsigned long addr, int offset, int value)
-{
- uint32_t *vaddr = (uint32_t *)addr;
- /* shift implied by pointer type */
- writel(value, vaddr + offset);
-}
-
-static unsigned int mem32_serial_in(unsigned long addr, int offset)
-{
- uint32_t *vaddr = (uint32_t *)addr;
- /* shift implied by pointer type */
- return readl(vaddr + offset);
-}
-
static unsigned int io_serial_in(unsigned long addr, int offset)
{
return inb(addr + offset);
@@ -205,6 +191,20 @@ static __init void early_serial_init(char *s)
}
#ifdef CONFIG_PCI
+static void mem32_serial_out(unsigned long addr, int offset, int value)
+{
+ u32 *vaddr = (u32 *)addr;
+ /* shift implied by pointer type */
+ writel(value, vaddr + offset);
+}
+
+static unsigned int mem32_serial_in(unsigned long addr, int offset)
+{
+ u32 *vaddr = (u32 *)addr;
+ /* shift implied by pointer type */
+ return readl(vaddr + offset);
+}
+
/*
* early_pci_serial_init()
*
@@ -217,8 +217,8 @@ static __init void early_pci_serial_init(char *s)
unsigned divisor;
unsigned long baud = DEFAULT_BAUD;
u8 bus, slot, func;
- uint32_t classcode, bar0;
- uint16_t cmdreg;
+ u32 classcode, bar0;
+ u16 cmdreg;
char *e;
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5b..1c30976 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -395,10 +395,13 @@ sysenter_past_esp:
/*CFI_REL_OFFSET cs, 0*/
/*
* Push current_thread_info()->sysenter_return to the stack.
- * A tiny bit of offset fixup is necessary - 4*4 means the 4 words
- * pushed above; +8 corresponds to copy_thread's esp0 setting.
+ * A tiny bit of offset fixup is necessary: TI_sysenter_return
+ * is relative to thread_info, which is at the bottom of the
+ * kernel stack page. 4*4 means the 4 words pushed above;
+ * TOP_OF_KERNEL_STACK_PADDING takes us to the top of the stack;
+ * and THREAD_SIZE takes us to the bottom.
*/
- pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+ pushl_cfi ((TI_sysenter_return) - THREAD_SIZE + TOP_OF_KERNEL_STACK_PADDING + 4*4)(%esp)
CFI_REL_OFFSET eip, 0
pushl_cfi %eax
@@ -432,7 +435,7 @@ sysenter_after_call:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx
- jne sysexit_audit
+ jnz sysexit_audit
sysenter_exit:
/* if something modifies registers it must also disable sysexit */
movl PT_EIP(%esp), %edx
@@ -460,7 +463,7 @@ sysenter_audit:
sysexit_audit:
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jne syscall_exit_work
+ jnz syscall_exit_work
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_ANY)
movl %eax,%edx /* second arg, syscall return value */
@@ -472,7 +475,7 @@ sysexit_audit:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT), %ecx
- jne syscall_exit_work
+ jnz syscall_exit_work
movl PT_EAX(%esp),%eax /* reload syscall return value */
jmp sysenter_exit
#endif
@@ -510,7 +513,7 @@ syscall_exit:
TRACE_IRQS_OFF
movl TI_flags(%ebp), %ecx
testl $_TIF_ALLWORK_MASK, %ecx # current->work
- jne syscall_exit_work
+ jnz syscall_exit_work
restore_all:
TRACE_IRQS_IRET
@@ -612,7 +615,7 @@ work_notifysig: # deal with pending signals and
#ifdef CONFIG_VM86
testl $X86_EFLAGS_VM, PT_EFLAGS(%esp)
movl %esp, %eax
- jne work_notifysig_v86 # returning to kernel-space or
+ jnz work_notifysig_v86 # returning to kernel-space or
# vm86-space
1:
#else
@@ -720,43 +723,22 @@ END(sysenter_badsys)
.endm
/*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
*/
-.section .init.rodata,"a"
-ENTRY(interrupt)
-.section .entry.text, "ax"
- .p2align 5
- .p2align CONFIG_X86_L1_CACHE_SHIFT
+ .align 8
ENTRY(irq_entries_start)
RING0_INT_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
- .balign 32
- .rept 7
- .if vector < FIRST_SYSTEM_VECTOR
- .if vector <> FIRST_EXTERNAL_VECTOR
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_interrupt
CFI_ADJUST_CFA_OFFSET -4
- .endif
-1: pushl_cfi $(~vector+0x80) /* Note: always in signed byte range */
- .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
- jmp 2f
- .endif
- .previous
- .long 1b
- .section .entry.text, "ax"
-vector=vector+1
- .endif
- .endr
-2: jmp common_interrupt
-.endr
+ .align 8
+ .endr
END(irq_entries_start)
-.previous
-END(interrupt)
-.previous
-
/*
* the CPU automatically disables interrupts when executing an IRQ vector,
* so IRQ-flags tracing has to follow that:
@@ -816,15 +798,9 @@ ENTRY(simd_coprocessor_error)
pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661: pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
- altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663: pushl $do_simd_coprocessor_error
-664:
-.previous
+ ALTERNATIVE "pushl_cfi $do_general_protection", \
+ "pushl $do_simd_coprocessor_error", \
+ X86_FEATURE_XMM
#else
pushl_cfi $do_simd_coprocessor_error
#endif
@@ -1240,20 +1216,13 @@ error_code:
/*CFI_REL_OFFSET es, 0*/
pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0*/
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg eax
+ pushl_cfi_reg ebp
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
+ pushl_cfi_reg edx
+ pushl_cfi_reg ecx
+ pushl_cfi_reg ebx
cld
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 2babb39..c7b2384 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,27 +14,14 @@
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
*
- * Normal syscalls and interrupts don't save a full stack frame, this is
- * only done for syscall tracing, signals or fork/exec et.al.
- *
* A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
* at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
* backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
* - ENTRY/END Define functions in the symbol table.
- * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
- * frame that is otherwise undefined after a SYSCALL
* - TRACE_IRQ_* - Trace hard interrupt state for lock debugging.
* - idtentry - Define exception entry points.
*/
@@ -70,10 +57,6 @@
.section .entry.text, "ax"
-#ifndef CONFIG_PREEMPT
-#define retint_kernel retint_restore_args
-#endif
-
#ifdef CONFIG_PARAVIRT
ENTRY(native_usergs_sysret64)
swapgs
@@ -82,9 +65,9 @@ ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON
1:
@@ -116,8 +99,8 @@ ENDPROC(native_usergs_sysret64)
call debug_stack_reset
.endm
-.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
- bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+.macro TRACE_IRQS_IRETQ_DEBUG
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON_DEBUG
1:
@@ -130,34 +113,7 @@ ENDPROC(native_usergs_sysret64)
#endif
/*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
- * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
- * manipulation.
- */
-
- /* %rsp:at FRAMEEND */
- .macro FIXUP_TOP_OF_STACK tmp offset=0
- movq PER_CPU_VAR(old_rsp),\tmp
- movq \tmp,RSP+\offset(%rsp)
- movq $__USER_DS,SS+\offset(%rsp)
- movq $__USER_CS,CS+\offset(%rsp)
- movq RIP+\offset(%rsp),\tmp /* get rip */
- movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
- movq R11+\offset(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS+\offset(%rsp)
- .endm
-
- .macro RESTORE_TOP_OF_STACK tmp offset=0
- movq RSP+\offset(%rsp),\tmp
- movq \tmp,PER_CPU_VAR(old_rsp)
- movq EFLAGS+\offset(%rsp),\tmp
- movq \tmp,R11+\offset(%rsp)
- .endm
-
-/*
- * initial frame state for interrupts (and exceptions without error code)
+ * empty frame
*/
.macro EMPTY_FRAME start=1 offset=0
.if \start
@@ -173,12 +129,12 @@ ENDPROC(native_usergs_sysret64)
* initial frame state for interrupts (and exceptions without error code)
*/
.macro INTR_FRAME start=1 offset=0
- EMPTY_FRAME \start, SS+8+\offset-RIP
- /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
- CFI_REL_OFFSET rsp, RSP+\offset-RIP
- /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
- /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
- CFI_REL_OFFSET rip, RIP+\offset-RIP
+ EMPTY_FRAME \start, 5*8+\offset
+ /*CFI_REL_OFFSET ss, 4*8+\offset*/
+ CFI_REL_OFFSET rsp, 3*8+\offset
+ /*CFI_REL_OFFSET rflags, 2*8+\offset*/
+ /*CFI_REL_OFFSET cs, 1*8+\offset*/
+ CFI_REL_OFFSET rip, 0*8+\offset
.endm
/*
@@ -186,30 +142,23 @@ ENDPROC(native_usergs_sysret64)
* with vector already pushed)
*/
.macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, RIP+\offset-ORIG_RAX
- .endm
-
-/*
- * frame that enables calling into C.
- */
- .macro PARTIAL_FRAME start=1 offset=0
- XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
- CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
- CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
- CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
- CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
- CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
- CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
- CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+ INTR_FRAME \start, 1*8+\offset
.endm
/*
* frame that enables passing a complete pt_regs to a C function.
*/
.macro DEFAULT_FRAME start=1 offset=0
- PARTIAL_FRAME \start, R11+\offset-R15
+ XCPT_FRAME \start, ORIG_RAX+\offset
+ CFI_REL_OFFSET rdi, RDI+\offset
+ CFI_REL_OFFSET rsi, RSI+\offset
+ CFI_REL_OFFSET rdx, RDX+\offset
+ CFI_REL_OFFSET rcx, RCX+\offset
+ CFI_REL_OFFSET rax, RAX+\offset
+ CFI_REL_OFFSET r8, R8+\offset
+ CFI_REL_OFFSET r9, R9+\offset
+ CFI_REL_OFFSET r10, R10+\offset
+ CFI_REL_OFFSET r11, R11+\offset
CFI_REL_OFFSET rbx, RBX+\offset
CFI_REL_OFFSET rbp, RBP+\offset
CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +167,30 @@ ENDPROC(native_usergs_sysret64)
CFI_REL_OFFSET r15, R15+\offset
.endm
-ENTRY(save_paranoid)
- XCPT_FRAME 1 RDI+8
- cld
- movq %rdi, RDI+8(%rsp)
- movq %rsi, RSI+8(%rsp)
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq %r8, R8+8(%rsp)
- movq %r9, R9+8(%rsp)
- movq %r10, R10+8(%rsp)
- movq %r11, R11+8(%rsp)
- movq_cfi rbx, RBX+8
- movq %rbp, RBP+8(%rsp)
- movq %r12, R12+8(%rsp)
- movq %r13, R13+8(%rsp)
- movq %r14, R14+8(%rsp)
- movq %r15, R15+8(%rsp)
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx,%ebx
-1: ret
- CFI_ENDPROC
-END(save_paranoid)
-
/*
- * A newly forked process directly context switches into this address.
+ * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
- DEFAULT_FRAME
-
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
- pushq_cfi $0x0002
- popfq_cfi # reset kernel eflags
-
- call schedule_tail # rdi: 'prev' task parameter
-
- GET_THREAD_INFO(%rcx)
-
- RESTORE_REST
-
- testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- jz 1f
-
- /*
- * By the time we get here, we have no idea whether our pt_regs,
- * ti flags, and ti status came from the 64-bit SYSCALL fast path,
- * the slow path, or one of the ia32entry paths.
- * Use int_ret_from_sys_call to return, since it can safely handle
- * all of the above.
- */
- jmp int_ret_from_sys_call
-
-1:
- subq $REST_SKIP, %rsp # leave space for volatiles
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq %rbp, %rdi
- call *%rbx
- movl $0, RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * System call entry. Up to 6 arguments in registers are supported.
+ * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
*
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer. However, it does mask the flags register for us, so
- * CLD and CLAC are not needed.
- */
-
-/*
- * Register setup:
+ * Registers on entry:
* rax system call number
+ * rcx return address
+ * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
- * rcx return address for syscall/sysret, C arg3
* rsi arg1
* rdx arg2
- * r10 arg3 (--> moved to rcx for C)
+ * r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
- * r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
+ * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
*
- * Interrupts are off on entry.
* Only called from user space.
*
- * XXX if we had a free scratch register we could save the RSP into the stack frame
- * and report it properly in ps. Unfortunately we haven't.
- *
- * When user can change the frames always force IRET. That is because
+ * When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
@@ -324,9 +198,15 @@ END(ret_from_fork)
ENTRY(system_call)
CFI_STARTPROC simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,KERNEL_STACK_OFFSET
+ CFI_DEF_CFA rsp,0
CFI_REGISTER rip,rcx
/*CFI_REGISTER rflags,r11*/
+
+ /*
+ * Interrupts are off on entry.
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
SWAPGS_UNSAFE_STACK
/*
* A hypervisor implementation might want to use a label
@@ -335,18 +215,38 @@ ENTRY(system_call)
*/
GLOBAL(system_call_after_swapgs)
- movq %rsp,PER_CPU_VAR(old_rsp)
+ movq %rsp,PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(kernel_stack),%rsp
+
+ /* Construct struct pt_regs on stack */
+ pushq_cfi $__USER_DS /* pt_regs->ss */
+ pushq_cfi PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */
/*
- * No need to follow this irqs off/on section - it's straight
- * and short:
+ * Re-enable interrupts.
+ * We use 'rsp_scratch' as a scratch space, hence irq-off block above
+ * must execute atomically in the face of possible interrupt-driven
+ * task preemption. We must enable interrupts only after we're done
+ * with using rsp_scratch:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8, 0, rax_enosys=1
- movq_cfi rax,(ORIG_RAX-ARGOFFSET)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ pushq_cfi %r11 /* pt_regs->flags */
+ pushq_cfi $__USER_CS /* pt_regs->cs */
+ pushq_cfi %rcx /* pt_regs->ip */
+ CFI_REL_OFFSET rip,0
+ pushq_cfi_reg rax /* pt_regs->orig_ax */
+ pushq_cfi_reg rdi /* pt_regs->di */
+ pushq_cfi_reg rsi /* pt_regs->si */
+ pushq_cfi_reg rdx /* pt_regs->dx */
+ pushq_cfi_reg rcx /* pt_regs->cx */
+ pushq_cfi $-ENOSYS /* pt_regs->ax */
+ pushq_cfi_reg r8 /* pt_regs->r8 */
+ pushq_cfi_reg r9 /* pt_regs->r9 */
+ pushq_cfi_reg r10 /* pt_regs->r10 */
+ pushq_cfi_reg r11 /* pt_regs->r11 */
+ sub $(6*8),%rsp /* pt_regs->bp,bx,r12-15 not saved */
+ CFI_ADJUST_CFA_OFFSET 6*8
+
+ testl $_TIF_WORK_SYSCALL_ENTRY, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
@@ -355,18 +255,21 @@ system_call_fastpath:
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
- ja ret_from_sys_call /* and return regs->ax */
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
movq %r10,%rcx
- call *sys_call_table(,%rax,8) # XXX: rip relative
- movq %rax,RAX-ARGOFFSET(%rsp)
+ call *sys_call_table(,%rax,8)
+ movq %rax,RAX(%rsp)
+1:
/*
- * Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
+ * Syscall return path ending with SYSRET (fast path).
+ * Has incompletely filled pt_regs.
*/
-ret_from_sys_call:
LOCKDEP_SYS_EXIT
+ /*
+ * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
+ * it is too small to ever cause noticeable irq latency.
+ */
DISABLE_INTERRUPTS(CLBR_NONE)
- TRACE_IRQS_OFF
/*
* We must check ti flags with interrupts (or at least preemption)
@@ -376,72 +279,73 @@ ret_from_sys_call:
* flags (TIF_NOTIFY_RESUME, TIF_USER_RETURN_NOTIFY, etc) set is
* very bad.
*/
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- jnz int_ret_from_sys_call_fixup /* Go the the slow path */
+ testl $_TIF_ALLWORK_MASK, ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS)
+ jnz int_ret_from_sys_call_irqs_off /* Go to the slow path */
CFI_REMEMBER_STATE
- /*
- * sysretq will re-enable interrupts:
- */
- TRACE_IRQS_ON
- movq RIP-ARGOFFSET(%rsp),%rcx
+
+ RESTORE_C_REGS_EXCEPT_RCX_R11
+ movq RIP(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 1,-ARG_SKIP,0
+ movq EFLAGS(%rsp),%r11
/*CFI_REGISTER rflags,r11*/
- movq PER_CPU_VAR(old_rsp), %rsp
+ movq RSP(%rsp),%rsp
+ /*
+ * 64bit SYSRET restores rip from rcx,
+ * rflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ * Restoration of rflags re-enables interrupts.
+ */
USERGS_SYSRET64
CFI_RESTORE_STATE
-int_ret_from_sys_call_fixup:
- FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
- jmp int_ret_from_sys_call_irqs_off
-
- /* Do syscall tracing */
+ /* Do syscall entry tracing */
tracesys:
- leaq -REST_SKIP(%rsp), %rdi
- movq $AUDIT_ARCH_X86_64, %rsi
+ movq %rsp, %rdi
+ movl $AUDIT_ARCH_X86_64, %esi
call syscall_trace_enter_phase1
test %rax, %rax
jnz tracesys_phase2 /* if needed, run the slow path */
- LOAD_ARGS 0 /* else restore clobbered regs */
+ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
+ movq ORIG_RAX(%rsp), %rax
jmp system_call_fastpath /* and return to the fast path */
tracesys_phase2:
- SAVE_REST
- FIXUP_TOP_OF_STACK %rdi
+ SAVE_EXTRA_REGS
movq %rsp, %rdi
- movq $AUDIT_ARCH_X86_64, %rsi
+ movl $AUDIT_ARCH_X86_64, %esi
movq %rax,%rdx
call syscall_trace_enter_phase2
/*
- * Reload arg registers from stack in case ptrace changed them.
+ * Reload registers from stack in case ptrace changed them.
* We don't reload %rax because syscall_trace_entry_phase2() returned
* the value it wants us to use in the table lookup.
*/
- LOAD_ARGS ARGOFFSET, 1
- RESTORE_REST
+ RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
#else
andl $__SYSCALL_MASK,%eax
cmpl $__NR_syscall_max,%eax
#endif
- ja int_ret_from_sys_call /* RAX(%rsp) is already set */
+ ja 1f /* return -ENOSYS (already in pt_regs->ax) */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
- /* Use IRET because user could have changed frame */
+ movq %rax,RAX(%rsp)
+1:
+ /* Use IRET because user could have changed pt_regs->foo */
/*
* Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
+ * Has correct iret frame.
*/
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
+int_ret_from_sys_call_irqs_off: /* jumps come here from the irqs-off SYSRET path */
TRACE_IRQS_OFF
-int_ret_from_sys_call_irqs_off:
movl $_TIF_ALLWORK_MASK,%edi
/* edi: mask to check */
GLOBAL(int_with_check)
@@ -450,8 +354,8 @@ GLOBAL(int_with_check)
movl TI_flags(%rcx),%edx
andl %edi,%edx
jnz int_careful
- andl $~TS_COMPAT,TI_status(%rcx)
- jmp retint_swapgs
+ andl $~TS_COMPAT,TI_status(%rcx)
+ jmp syscall_return
/* Either reschedule or signal or syscall exit tracking needed. */
/* First do a reschedule test. */
@@ -468,12 +372,11 @@ int_careful:
TRACE_IRQS_OFF
jmp int_with_check
- /* handle signals and tracing -- both require a full stack frame */
+ /* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
- SAVE_REST
+ SAVE_EXTRA_REGS
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
@@ -492,86 +395,192 @@ int_signal:
call do_notify_resume
1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
+
+syscall_return:
+ /* The IRETQ could re-enable interrupts: */
+ DISABLE_INTERRUPTS(CLBR_ANY)
+ TRACE_IRQS_IRETQ
+
+ /*
+ * Try to use SYSRET instead of IRET if we're returning to
+ * a completely clean 64-bit userspace context.
+ */
+ movq RCX(%rsp),%rcx
+ cmpq %rcx,RIP(%rsp) /* RCX == RIP */
+ jne opportunistic_sysret_failed
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP. It's not worth
+ * testing for canonicalness exactly -- this check detects any
+ * of the 17 high bits set, which is true for non-canonical
+ * or kernel addresses. (This will pessimize vsyscall=native.
+ * Big deal.)
+ *
+ * If virtual addresses ever become wider, this will need
+ * to be updated to remain correct on both old and new CPUs.
+ */
+ .ifne __VIRTUAL_MASK_SHIFT - 47
+ .error "virtual address width changed -- SYSRET checks need update"
+ .endif
+ shr $__VIRTUAL_MASK_SHIFT, %rcx
+ jnz opportunistic_sysret_failed
+
+ cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ movq R11(%rsp),%r11
+ cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
+ jne opportunistic_sysret_failed
+
+ /*
+ * SYSRET can't restore RF. SYSRET can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET. This would cause an infinite loop whenever #DB happens
+ * with register state that satisfies the opportunistic SYSRET
+ * conditions. For example, single-stepping this user code:
+ *
+ * movq $stuck_here,%rcx
+ * pushfq
+ * popq %r11
+ * stuck_here:
+ *
+ * would never get past 'stuck_here'.
+ */
+ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
+ jnz opportunistic_sysret_failed
+
+ /* nothing to check for RSP */
+
+ cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
+ jne opportunistic_sysret_failed
+
+ /*
+ * We win! This label is here just for ease of understanding
+ * perf profiles. Nothing jumps here.
+ */
+syscall_return_via_sysret:
+ CFI_REMEMBER_STATE
+ /* r11 is already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_R11
+ movq RSP(%rsp),%rsp
+ USERGS_SYSRET64
+ CFI_RESTORE_STATE
+
+opportunistic_sysret_failed:
+ SWAPGS
+ jmp restore_c_regs_and_iret
CFI_ENDPROC
END(system_call)
+
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
- popq %r11 /* save return address */
- PARTIAL_FRAME 0
- SAVE_REST
- pushq %r11 /* put it back on stack */
- FIXUP_TOP_OF_STACK %r11, 8
- DEFAULT_FRAME 0 8 /* offset 8: return address */
- call sys_\func
- RESTORE_TOP_OF_STACK %r11, 8
- ret $REST_SKIP /* pop extended registers */
+ DEFAULT_FRAME 0, 8 /* offset 8: return address */
+ SAVE_EXTRA_REGS 8
+ jmp sys_\func
CFI_ENDPROC
END(stub_\func)
.endm
- .macro FIXED_FRAME label,func
-ENTRY(\label)
- CFI_STARTPROC
- PARTIAL_FRAME 0 8 /* offset 8: return address */
- FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
- call \func
- RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
- ret
- CFI_ENDPROC
-END(\label)
- .endm
-
FORK_LIKE clone
FORK_LIKE fork
FORK_LIKE vfork
- FIXED_FRAME stub_iopl, sys_iopl
ENTRY(stub_execve)
CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call sys_execve
- movq %rax,RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
+ DEFAULT_FRAME 0, 8
+ call sys_execve
+return_from_execve:
+ testl %eax, %eax
+ jz 1f
+ /* exec failed, can use fast SYSRET code path in this case */
+ ret
+1:
+ /* must use IRET code path (pt_regs->cs may have changed) */
+ addq $8, %rsp
+ CFI_ADJUST_CFA_OFFSET -8
+ ZERO_EXTRA_REGS
+ movq %rax,RAX(%rsp)
+ jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
-
-ENTRY(stub_execveat)
+/*
+ * Remaining execve stubs are only 7 bytes long.
+ * ENTRY() often aligns to 16 bytes, which in this case has no benefits.
+ */
+ .align 8
+GLOBAL(stub_execveat)
CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call sys_execveat
- RESTORE_TOP_OF_STACK %r11
- movq %rax,RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
+ DEFAULT_FRAME 0, 8
+ call sys_execveat
+ jmp return_from_execve
CFI_ENDPROC
END(stub_execveat)
+#ifdef CONFIG_X86_X32_ABI
+ .align 8
+GLOBAL(stub_x32_execve)
+ CFI_STARTPROC
+ DEFAULT_FRAME 0, 8
+ call compat_sys_execve
+ jmp return_from_execve
+ CFI_ENDPROC
+END(stub_x32_execve)
+ .align 8
+GLOBAL(stub_x32_execveat)
+ CFI_STARTPROC
+ DEFAULT_FRAME 0, 8
+ call compat_sys_execveat
+ jmp return_from_execve
+ CFI_ENDPROC
+END(stub_x32_execveat)
+#endif
+
+#ifdef CONFIG_IA32_EMULATION
+ .align 8
+GLOBAL(stub32_execve)
+ CFI_STARTPROC
+ call compat_sys_execve
+ jmp return_from_execve
+ CFI_ENDPROC
+END(stub32_execve)
+ .align 8
+GLOBAL(stub32_execveat)
+ CFI_STARTPROC
+ call compat_sys_execveat
+ jmp return_from_execve
+ CFI_ENDPROC
+END(stub32_execveat)
+#endif
+
/*
* sigreturn is special because it needs to restore all registers on return.
* This cannot be done with SYSRET, so use the IRET return path instead.
*/
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
+ DEFAULT_FRAME 0, 8
+ /*
+ * SAVE_EXTRA_REGS result is not normally needed:
+ * sigreturn overwrites all pt_regs->GPREGS.
+ * But sigreturn can fail (!), and there is no easy way to detect that.
+ * To make sure RESTORE_EXTRA_REGS doesn't restore garbage on error,
+ * we SAVE_EXTRA_REGS here.
+ */
+ SAVE_EXTRA_REGS 8
call sys_rt_sigreturn
- movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
+return_from_stub:
+ addq $8, %rsp
+ CFI_ADJUST_CFA_OFFSET -8
+ RESTORE_EXTRA_REGS
+ movq %rax,RAX(%rsp)
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_rt_sigreturn)
@@ -579,86 +588,70 @@ END(stub_rt_sigreturn)
#ifdef CONFIG_X86_X32_ABI
ENTRY(stub_x32_rt_sigreturn)
CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
+ DEFAULT_FRAME 0, 8
+ SAVE_EXTRA_REGS 8
call sys32_x32_rt_sigreturn
- movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
- jmp int_ret_from_sys_call
+ jmp return_from_stub
CFI_ENDPROC
END(stub_x32_rt_sigreturn)
+#endif
-ENTRY(stub_x32_execve)
- CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call compat_sys_execve
- RESTORE_TOP_OF_STACK %r11
- movq %rax,RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(stub_x32_execve)
+/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+ DEFAULT_FRAME
-ENTRY(stub_x32_execveat)
- CFI_STARTPROC
- addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
- FIXUP_TOP_OF_STACK %r11
- call compat_sys_execveat
- RESTORE_TOP_OF_STACK %r11
- movq %rax,RAX(%rsp)
- RESTORE_REST
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+ pushq_cfi $0x0002
+ popfq_cfi # reset kernel eflags
+
+ call schedule_tail # rdi: 'prev' task parameter
+
+ RESTORE_EXTRA_REGS
+
+ testl $3,CS(%rsp) # from kernel_thread?
+
+ /*
+ * By the time we get here, we have no idea whether our pt_regs,
+ * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+ * the slow path, or one of the ia32entry paths.
+ * Use IRET code path to return, since it can safely handle
+ * all of the above.
+ */
+ jnz int_ret_from_sys_call
+
+ /* We came from kernel_thread */
+ /* nb: we depend on RESTORE_EXTRA_REGS above */
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
-END(stub_x32_execveat)
-
-#endif
+END(ret_from_fork)
/*
- * Build the entry stubs and pointer table with some assembler magic.
- * We pack 7 stubs into a single 32-byte chunk, which will fit in a
- * single cache line on all modern x86 implementations.
+ * Build the entry stubs with some assembler magic.
+ * We pack 1 stub into every 8-byte block.
*/
- .section .init.rodata,"a"
-ENTRY(interrupt)
- .section .entry.text
- .p2align 5
- .p2align CONFIG_X86_L1_CACHE_SHIFT
+ .align 8
ENTRY(irq_entries_start)
INTR_FRAME
-vector=FIRST_EXTERNAL_VECTOR
-.rept (FIRST_SYSTEM_VECTOR-FIRST_EXTERNAL_VECTOR+6)/7
- .balign 32
- .rept 7
- .if vector < FIRST_SYSTEM_VECTOR
- .if vector <> FIRST_EXTERNAL_VECTOR
+ vector=FIRST_EXTERNAL_VECTOR
+ .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR)
+ pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
+ vector=vector+1
+ jmp common_interrupt
CFI_ADJUST_CFA_OFFSET -8
- .endif
-1: pushq_cfi $(~vector+0x80) /* Note: always in signed byte range */
- .if ((vector-FIRST_EXTERNAL_VECTOR)%7) <> 6
- jmp 2f
- .endif
- .previous
- .quad 1b
- .section .entry.text
-vector=vector+1
- .endif
- .endr
-2: jmp common_interrupt
-.endr
+ .align 8
+ .endr
CFI_ENDPROC
END(irq_entries_start)
-.previous
-END(interrupt)
-.previous
-
/*
* Interrupt entry/exit.
*
@@ -669,47 +662,45 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- /* reserve pt_regs for scratch regs and rbp */
- subq $ORIG_RAX-RBP, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
cld
- /* start from rbp in pt_regs and jump over */
- movq_cfi rdi, (RDI-RBP)
- movq_cfi rsi, (RSI-RBP)
- movq_cfi rdx, (RDX-RBP)
- movq_cfi rcx, (RCX-RBP)
- movq_cfi rax, (RAX-RBP)
- movq_cfi r8, (R8-RBP)
- movq_cfi r9, (R9-RBP)
- movq_cfi r10, (R10-RBP)
- movq_cfi r11, (R11-RBP)
-
- /* Save rbp so that we can unwind from get_irq_regs() */
- movq_cfi rbp, 0
-
- /* Save previous stack value */
- movq %rsp, %rsi
+ /*
+ * Since nothing in interrupt handling code touches r12...r15 members
+ * of "struct pt_regs", and since interrupts can nest, we can save
+ * four stack slots and simultaneously provide
+ * an unwind-friendly stack layout by saving "truncated" pt_regs
+ * exactly up to rbp slot, without these members.
+ */
+ ALLOC_PT_GPREGS_ON_STACK -RBP
+ SAVE_C_REGS -RBP
+ /* this goes to 0(%rsp) for unwinder, not for saving the value: */
+ SAVE_EXTRA_REGS_RBP -RBP
+
+ leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
- leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS-RBP(%rsi)
+ testl $3, CS-RBP(%rsp)
je 1f
SWAPGS
+1:
/*
+ * Save previous stack pointer, optionally switch to interrupt stack.
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl PER_CPU_VAR(irq_count)
+ movq %rsp, %rsi
+ incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
-
- /* Store previous stack value */
pushq %rsi
+ /*
+ * For debugger:
+ * "CFA (Current Frame Address) is the value on stack + offset"
+ */
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 */, 0, \
+ 0x77 /* DW_OP_breg7 (rsp) */, 0, \
0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
0x22 /* DW_OP_plus */
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
@@ -727,7 +718,7 @@ common_interrupt:
ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
- /* 0(%rsp): old_rsp-ARGOFFSET */
+ /* 0(%rsp): old RSP */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -735,19 +726,18 @@ ret_from_intr:
/* Restore saved previous stack */
popq %rsi
- CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
- leaq ARGOFFSET-RBP(%rsi), %rsp
+ CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
+ /* return code expects complete pt_regs - adjust rsp accordingly: */
+ leaq -RBP(%rsi),%rsp
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
+ CFI_ADJUST_CFA_OFFSET RBP
-exit_intr:
- GET_THREAD_INFO(%rcx)
- testl $3,CS-ARGOFFSET(%rsp)
+ testl $3,CS(%rsp)
je retint_kernel
-
/* Interrupt came from user space */
+
+ GET_THREAD_INFO(%rcx)
/*
- * Has a correct top of stack, but a partial stack frame
* %rcx: thread info. Interrupts off.
*/
retint_with_reschedule:
@@ -766,70 +756,34 @@ retint_swapgs: /* return to user-space */
DISABLE_INTERRUPTS(CLBR_ANY)
TRACE_IRQS_IRETQ
- /*
- * Try to use SYSRET instead of IRET if we're returning to
- * a completely clean 64-bit userspace context.
- */
- movq (RCX-R11)(%rsp), %rcx
- cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
- jne opportunistic_sysret_failed
-
- /*
- * On Intel CPUs, sysret with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP. It's not worth
- * testing for canonicalness exactly -- this check detects any
- * of the 17 high bits set, which is true for non-canonical
- * or kernel addresses. (This will pessimize vsyscall=native.
- * Big deal.)
- *
- * If virtual addresses ever become wider, this will need
- * to be updated to remain correct on both old and new CPUs.
- */
- .ifne __VIRTUAL_MASK_SHIFT - 47
- .error "virtual address width changed -- sysret checks need update"
- .endif
- shr $__VIRTUAL_MASK_SHIFT, %rcx
- jnz opportunistic_sysret_failed
-
- cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
- jne opportunistic_sysret_failed
-
- movq (R11-ARGOFFSET)(%rsp), %r11
- cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
- jne opportunistic_sysret_failed
-
- testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
- jnz opportunistic_sysret_failed
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
- jne opportunistic_sysret_failed
-
- /*
- * We win! This label is here just for ease of understanding
- * perf profiles. Nothing jumps here.
- */
-irq_return_via_sysret:
- CFI_REMEMBER_STATE
- RESTORE_ARGS 1,8,1
- movq (RSP-RIP)(%rsp),%rsp
- USERGS_SYSRET64
- CFI_RESTORE_STATE
-
-opportunistic_sysret_failed:
SWAPGS
- jmp restore_args
+ jmp restore_c_regs_and_iret
-retint_restore_args: /* return to kernel space */
- DISABLE_INTERRUPTS(CLBR_ANY)
+/* Returning to kernel space */
+retint_kernel:
+#ifdef CONFIG_PREEMPT
+ /* Interrupts are off */
+ /* Check if we need preemption */
+ bt $9,EFLAGS(%rsp) /* interrupts were off? */
+ jnc 1f
+0: cmpl $0,PER_CPU_VAR(__preempt_count)
+ jnz 1f
+ call preempt_schedule_irq
+ jmp 0b
+1:
+#endif
/*
* The iretq could re-enable interrupts:
*/
TRACE_IRQS_IRETQ
-restore_args:
- RESTORE_ARGS 1,8,1
+
+/*
+ * At this label, code paths which return to kernel and to user,
+ * which come from interrupts/exception and from syscalls, merge.
+ */
+restore_c_regs_and_iret:
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
irq_return:
INTERRUPT_RETURN
@@ -900,28 +854,17 @@ retint_signal:
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_REST
+ SAVE_EXTRA_REGS
movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
jmp retint_with_reschedule
-#ifdef CONFIG_PREEMPT
- /* Returning to kernel space. Check if we need preemption */
- /* rcx: threadinfo. interrupts off. */
-ENTRY(retint_kernel)
- cmpl $0,PER_CPU_VAR(__preempt_count)
- jnz retint_restore_args
- bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
- jnc retint_restore_args
- call preempt_schedule_irq
- jmp exit_intr
-#endif
CFI_ENDPROC
END(common_interrupt)
@@ -1010,7 +953,7 @@ apicinterrupt IRQ_WORK_VECTOR \
/*
* Exception entry points.
*/
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
@@ -1032,8 +975,7 @@ ENTRY(\sym)
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
.endif
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ ALLOC_PT_GPREGS_ON_STACK
.if \paranoid
.if \paranoid == 1
@@ -1041,10 +983,11 @@ ENTRY(\sym)
testl $3, CS(%rsp) /* If coming from userspace, switch */
jnz 1f /* stacks. */
.endif
- call save_paranoid
+ call paranoid_entry
.else
call error_entry
.endif
+ /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
DEFAULT_FRAME 0
@@ -1066,19 +1009,20 @@ ENTRY(\sym)
.endif
.if \shift_ist != -1
- subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
call \do_sym
.if \shift_ist != -1
- addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
+ /* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
- jmp paranoid_exit /* %ebx: no swapgs flag */
+ jmp paranoid_exit
.else
- jmp error_exit /* %ebx: no swapgs flag */
+ jmp error_exit
.endif
.if \paranoid == 1
@@ -1282,7 +1226,9 @@ ENTRY(xen_failsafe_callback)
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
pushq_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
@@ -1314,59 +1260,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
- /*
- * "Paranoid" exit path from exception stack. This is invoked
- * only on return from non-NMI IST interrupts that came
- * from kernel space.
- *
- * We may be returning to very strange contexts (e.g. very early
- * in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, we there's no good reason
- * to try to handle preemption here.
- */
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+ XCPT_FRAME 1 15*8
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(paranoid_entry)
- /* ebx: no swapgs flag */
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- TRACE_IRQS_IRETQ 0
+ jnz paranoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- RESTORE_ALL 8
- INTERRUPT_RETURN
-paranoid_restore:
- TRACE_IRQS_IRETQ_DEBUG 0
- RESTORE_ALL 8
+ jmp paranoid_exit_restore
+paranoid_exit_no_swapgs:
+ TRACE_IRQS_IRETQ_DEBUG
+paranoid_exit_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
CFI_ENDPROC
END(paranoid_exit)
/*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(error_entry)
- XCPT_FRAME
- CFI_ADJUST_CFA_OFFSET 15*8
- /* oldrax contains error code */
+ XCPT_FRAME 1 15*8
cld
- movq %rdi, RDI+8(%rsp)
- movq %rsi, RSI+8(%rsp)
- movq %rdx, RDX+8(%rsp)
- movq %rcx, RCX+8(%rsp)
- movq %rax, RAX+8(%rsp)
- movq %r8, R8+8(%rsp)
- movq %r9, R9+8(%rsp)
- movq %r10, R10+8(%rsp)
- movq %r11, R11+8(%rsp)
- movq_cfi rbx, RBX+8
- movq %rbp, RBP+8(%rsp)
- movq %r12, R12+8(%rsp)
- movq %r13, R13+8(%rsp)
- movq %r14, R14+8(%rsp)
- movq %r15, R15+8(%rsp)
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1376,12 +1329,12 @@ error_sti:
TRACE_IRQS_OFF
ret
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
error_kernelspace:
CFI_REL_OFFSET rcx, RCX+8
incl %ebx
@@ -1411,11 +1364,11 @@ error_bad_iret:
END(error_entry)
-/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
DEFAULT_FRAME
movl %ebx,%eax
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
@@ -1430,19 +1383,7 @@ ENTRY(error_exit)
CFI_ENDPROC
END(error_exit)
-/*
- * Test if a given stack is an NMI stack or not.
- */
- .macro test_in_nmi reg stack nmi_ret normal_ret
- cmpq %\reg, \stack
- ja \normal_ret
- subq $EXCEPTION_STKSZ, %\reg
- cmpq %\reg, \stack
- jb \normal_ret
- jmp \nmi_ret
- .endm
-
- /* runs on exception stack */
+/* Runs on exception stack */
ENTRY(nmi)
INTR_FRAME
PARAVIRT_ADJUST_EXCEPTION_FRAME
@@ -1478,7 +1419,7 @@ ENTRY(nmi)
* NMI.
*/
- /* Use %rdx as out temp variable throughout */
+ /* Use %rdx as our temp variable throughout */
pushq_cfi %rdx
CFI_REL_OFFSET rdx, 0
@@ -1503,8 +1444,17 @@ ENTRY(nmi)
* We check the variable because the first NMI could be in a
* breakpoint routine using a breakpoint stack.
*/
- lea 6*8(%rsp), %rdx
- test_in_nmi rdx, 4*8(%rsp), nested_nmi, first_nmi
+ lea 6*8(%rsp), %rdx
+ /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) */
+ cmpq %rdx, 4*8(%rsp)
+ /* If the stack pointer is above the NMI stack, this is a normal NMI */
+ ja first_nmi
+ subq $EXCEPTION_STKSZ, %rdx
+ cmpq %rdx, 4*8(%rsp)
+ /* If it is below the NMI stack, it is a normal NMI */
+ jb first_nmi
+ /* Ah, it is within the NMI stack, treat it as nested */
+
CFI_REMEMBER_STATE
nested_nmi:
@@ -1597,7 +1547,7 @@ first_nmi:
.rept 5
pushq_cfi 11*8(%rsp)
.endr
- CFI_DEF_CFA_OFFSET SS+8-RIP
+ CFI_DEF_CFA_OFFSET 5*8
/* Everything up to here is safe from nested NMIs */
@@ -1625,7 +1575,7 @@ repeat_nmi:
pushq_cfi -6*8(%rsp)
.endr
subq $(5*8), %rsp
- CFI_DEF_CFA_OFFSET SS+8-RIP
+ CFI_DEF_CFA_OFFSET 5*8
end_repeat_nmi:
/*
@@ -1634,16 +1584,16 @@ end_repeat_nmi:
* so that we repeat another NMI.
*/
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ ALLOC_PT_GPREGS_ON_STACK
+
/*
- * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+ * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
* Even with normal interrupts enabled. An NMI should not be
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
*/
- call save_paranoid
+ call paranoid_entry
DEFAULT_FRAME 0
/*
@@ -1674,8 +1624,10 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
/* Pop the extra iret frame at once */
- RESTORE_ALL 6*8
+ REMOVE_PT_GPREGS_FROM_STACK 6*8
/* Clear the NMI executing stack variable */
movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/head64.c b/arch/x86/kernel/head64.c
index c4f8d46..2b55ee6 100644
--- a/arch/x86/kernel/head64.c
+++ b/arch/x86/kernel/head64.c
@@ -177,9 +177,6 @@ asmlinkage __visible void __init x86_64_start_kernel(char * real_mode_data)
*/
load_ucode_bsp();
- if (console_loglevel >= CONSOLE_LOGLEVEL_DEBUG)
- early_printk("Kernel alive\n");
-
clear_page(init_level4_pgt);
/* set init_level4_pgt kernel high mapping*/
init_level4_pgt[511] = early_level4_pgt[511];
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42..d031bad 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
#include <asm/cpufeature.h>
#include <asm/percpu.h>
#include <asm/nops.h>
+#include <asm/bootparam.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
- testb $(1<<6), BP_loadflags(%esi)
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
jnz 2f
/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6fd514d9..ae6588b 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
/*
- * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
* %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
- * arch/x86_64/boot/compressed/head.S.
+ * arch/x86/boot/compressed/head_64.S.
*
* We only come here initially at boot nothing else comes here.
*
@@ -146,7 +146,7 @@ startup_64:
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
-1: testq $1, 0(%rdi)
+1: testb $1, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
diff --git a/arch/x86/kernel/i387.c b/arch/x86/kernel/i387.c
index d5651fc..367f39d 100644
--- a/arch/x86/kernel/i387.c
+++ b/arch/x86/kernel/i387.c
@@ -42,8 +42,8 @@ void kernel_fpu_enable(void)
* be set (so that the clts/stts pair does nothing that is
* visible in the interrupted kernel thread).
*
- * Except for the eagerfpu case when we return 1 unless we've already
- * been eager and saved the state in kernel_fpu_begin().
+ * Except for the eagerfpu case when we return true; in the likely case
+ * the thread has FPU but we are not going to set/clear TS.
*/
static inline bool interrupted_kernel_fpu_idle(void)
{
@@ -51,7 +51,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
return false;
if (use_eager_fpu())
- return __thread_has_fpu(current);
+ return true;
return !__thread_has_fpu(current) &&
(read_cr0() & X86_CR0_TS);
@@ -68,7 +68,7 @@ static inline bool interrupted_kernel_fpu_idle(void)
static inline bool interrupted_user_mode(void)
{
struct pt_regs *regs = get_irq_regs();
- return regs && user_mode_vm(regs);
+ return regs && user_mode(regs);
}
/*
@@ -94,9 +94,10 @@ void __kernel_fpu_begin(void)
if (__thread_has_fpu(me)) {
__save_init_fpu(me);
- } else if (!use_eager_fpu()) {
+ } else {
this_cpu_write(fpu_owner_task, NULL);
- clts();
+ if (!use_eager_fpu())
+ clts();
}
}
EXPORT_SYMBOL(__kernel_fpu_begin);
@@ -107,7 +108,7 @@ void __kernel_fpu_end(void)
if (__thread_has_fpu(me)) {
if (WARN_ON(restore_fpu_checking(me)))
- drop_init_fpu(me);
+ fpu_reset_state(me);
} else if (!use_eager_fpu()) {
stts();
}
@@ -120,10 +121,13 @@ void unlazy_fpu(struct task_struct *tsk)
{
preempt_disable();
if (__thread_has_fpu(tsk)) {
- __save_init_fpu(tsk);
- __thread_fpu_end(tsk);
- } else
- tsk->thread.fpu_counter = 0;
+ if (use_eager_fpu()) {
+ __save_fpu(tsk);
+ } else {
+ __save_init_fpu(tsk);
+ __thread_fpu_end(tsk);
+ }
+ }
preempt_enable();
}
EXPORT_SYMBOL(unlazy_fpu);
@@ -221,11 +225,12 @@ void fpu_finit(struct fpu *fpu)
return;
}
+ memset(fpu->state, 0, xstate_size);
+
if (cpu_has_fxsr) {
fx_finit(&fpu->state->fxsave);
} else {
struct i387_fsave_struct *fp = &fpu->state->fsave;
- memset(fp, 0, xstate_size);
fp->cwd = 0xffff037fu;
fp->swd = 0xffff0000u;
fp->twd = 0xffffffffu;
@@ -247,7 +252,7 @@ int init_fpu(struct task_struct *tsk)
if (tsk_used_math(tsk)) {
if (cpu_has_fpu && tsk == current)
unlazy_fpu(tsk);
- tsk->thread.fpu.last_cpu = ~0;
+ task_disable_lazy_fpu_restore(tsk);
return 0;
}
@@ -336,6 +341,7 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
void *kbuf, void __user *ubuf)
{
+ struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
int ret;
if (!cpu_has_xsave)
@@ -350,14 +356,12 @@ int xstateregs_get(struct task_struct *target, const struct user_regset *regset,
* memory layout in the thread struct, so that we can copy the entire
* xstateregs to the user using one user_regset_copyout().
*/
- memcpy(&target->thread.fpu.state->fxsave.sw_reserved,
- xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
-
+ memcpy(&xsave->i387.sw_reserved,
+ xstate_fx_sw_bytes, sizeof(xstate_fx_sw_bytes));
/*
* Copy the xstate memory layout.
*/
- ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->xsave, 0, -1);
+ ret = user_regset_copyout(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
return ret;
}
@@ -365,8 +369,8 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
unsigned int pos, unsigned int count,
const void *kbuf, const void __user *ubuf)
{
+ struct xsave_struct *xsave = &target->thread.fpu.state->xsave;
int ret;
- struct xsave_hdr_struct *xsave_hdr;
if (!cpu_has_xsave)
return -ENODEV;
@@ -375,22 +379,16 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset,
if (ret)
return ret;
- ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
- &target->thread.fpu.state->xsave, 0, -1);
-
+ ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1);
/*
* mxcsr reserved bits must be masked to zero for security reasons.
*/
- target->thread.fpu.state->fxsave.mxcsr &= mxcsr_feature_mask;
-
- xsave_hdr = &target->thread.fpu.state->xsave.xsave_hdr;
-
- xsave_hdr->xstate_bv &= pcntxt_mask;
+ xsave->i387.mxcsr &= mxcsr_feature_mask;
+ xsave->xsave_hdr.xstate_bv &= pcntxt_mask;
/*
* These bits must be zero.
*/
- memset(xsave_hdr->reserved, 0, 48);
-
+ memset(&xsave->xsave_hdr.reserved, 0, 48);
return ret;
}
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66..37dae79 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c
index 67b1cbe..e5952c2 100644
--- a/arch/x86/kernel/irq.c
+++ b/arch/x86/kernel/irq.c
@@ -295,7 +295,7 @@ int check_irq_vectors_for_cpu_disable(void)
this_cpu = smp_processor_id();
cpumask_copy(&online_new, cpu_online_mask);
- cpu_clear(this_cpu, online_new);
+ cpumask_clear_cpu(this_cpu, &online_new);
this_count = 0;
for (vector = FIRST_EXTERNAL_VECTOR; vector < NR_VECTORS; vector++) {
@@ -307,7 +307,7 @@ int check_irq_vectors_for_cpu_disable(void)
data = irq_desc_get_irq_data(desc);
cpumask_copy(&affinity_new, data->affinity);
- cpu_clear(this_cpu, affinity_new);
+ cpumask_clear_cpu(this_cpu, &affinity_new);
/* Do not count inactive or per-cpu irqs. */
if (!irq_has_action(irq) || irqd_is_per_cpu(data))
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 28d28f5..f9fd86a 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -165,7 +165,7 @@ bool handle_irq(unsigned irq, struct pt_regs *regs)
if (unlikely(!desc))
return false;
- if (user_mode_vm(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
+ if (user_mode(regs) || !execute_on_irq_stack(overflow, desc, irq)) {
if (unlikely(overflow))
print_stack_overflow();
desc->handle_irq(irq, desc);
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index e4b503d..394e643 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -44,7 +44,7 @@ static inline void stack_overflow_check(struct pt_regs *regs)
u64 estack_top, estack_bottom;
u64 curbase = (u64)task_stack_page(current);
- if (user_mode_vm(regs))
+ if (user_mode(regs))
return;
if (regs->sp >= curbase + sizeof(struct thread_info) +
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index 70e181e..cd10a64 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -178,7 +178,8 @@ void __init native_init_IRQ(void)
#endif
for_each_clear_bit_from(i, used_vectors, first_system_vector) {
/* IA32_SYSCALL_VECTOR could be used in trap_init already. */
- set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]);
+ set_intr_gate(i, irq_entries_start +
+ 8 * (i - FIRST_EXTERNAL_VECTOR));
}
#ifdef CONFIG_X86_LOCAL_APIC
for_each_clear_bit_from(i, used_vectors, NR_VECTORS)
diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c
index 7ec1d5f..d6178d9 100644
--- a/arch/x86/kernel/kgdb.c
+++ b/arch/x86/kernel/kgdb.c
@@ -72,7 +72,7 @@ struct dbg_reg_def_t dbg_reg_def[DBG_MAX_REG_NUM] =
{ "bx", 8, offsetof(struct pt_regs, bx) },
{ "cx", 8, offsetof(struct pt_regs, cx) },
{ "dx", 8, offsetof(struct pt_regs, dx) },
- { "si", 8, offsetof(struct pt_regs, dx) },
+ { "si", 8, offsetof(struct pt_regs, si) },
{ "di", 8, offsetof(struct pt_regs, di) },
{ "bp", 8, offsetof(struct pt_regs, bp) },
{ "sp", 8, offsetof(struct pt_regs, sp) },
@@ -126,11 +126,11 @@ char *dbg_get_reg(int regno, void *mem, struct pt_regs *regs)
#ifdef CONFIG_X86_32
switch (regno) {
case GDB_SS:
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
*(unsigned long *)mem = __KERNEL_DS;
break;
case GDB_SP:
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
*(unsigned long *)mem = kernel_stack_pointer(regs);
break;
case GDB_GS:
diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c
index 03189d8..1deffe6 100644
--- a/arch/x86/kernel/kprobes/core.c
+++ b/arch/x86/kernel/kprobes/core.c
@@ -605,7 +605,7 @@ int kprobe_int3_handler(struct pt_regs *regs)
struct kprobe *p;
struct kprobe_ctlblk *kcb;
- if (user_mode_vm(regs))
+ if (user_mode(regs))
return 0;
addr = (kprobe_opcode_t *)(regs->ip - sizeof(kprobe_opcode_t));
@@ -1010,7 +1010,7 @@ int kprobe_exceptions_notify(struct notifier_block *self, unsigned long val,
struct die_args *args = data;
int ret = NOTIFY_DONE;
- if (args->regs && user_mode_vm(args->regs))
+ if (args->regs && user_mode(args->regs))
return ret;
if (val == DIE_GPF) {
diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c
index d1ac80b..005c03e 100644
--- a/arch/x86/kernel/module.c
+++ b/arch/x86/kernel/module.c
@@ -33,6 +33,7 @@
#include <asm/page.h>
#include <asm/pgtable.h>
+#include <asm/setup.h>
#if 0
#define DEBUGP(fmt, ...) \
@@ -47,21 +48,13 @@ do { \
#ifdef CONFIG_RANDOMIZE_BASE
static unsigned long module_load_offset;
-static int randomize_modules = 1;
/* Mutex protects the module_load_offset. */
static DEFINE_MUTEX(module_kaslr_mutex);
-static int __init parse_nokaslr(char *p)
-{
- randomize_modules = 0;
- return 0;
-}
-early_param("nokaslr", parse_nokaslr);
-
static unsigned long int get_module_load_offset(void)
{
- if (randomize_modules) {
+ if (kaslr_enabled()) {
mutex_lock(&module_kaslr_mutex);
/*
* Calculate the module_load_offset the first time this
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861c..da8cb98 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -131,10 +131,11 @@ void perf_get_regs_user(struct perf_regs *regs_user,
}
/*
- * RIP, flags, and the argument registers are usually saved.
- * orig_ax is probably okay, too.
+ * These registers are always saved on 64-bit syscall entry.
+ * On 32-bit entry points, they are saved too except r8..r11.
*/
regs_user_copy->ip = user_regs->ip;
+ regs_user_copy->ax = user_regs->ax;
regs_user_copy->cx = user_regs->cx;
regs_user_copy->dx = user_regs->dx;
regs_user_copy->si = user_regs->si;
@@ -145,9 +146,12 @@ void perf_get_regs_user(struct perf_regs *regs_user,
regs_user_copy->r11 = user_regs->r11;
regs_user_copy->orig_ax = user_regs->orig_ax;
regs_user_copy->flags = user_regs->flags;
+ regs_user_copy->sp = user_regs->sp;
+ regs_user_copy->cs = user_regs->cs;
+ regs_user_copy->ss = user_regs->ss;
/*
- * Don't even try to report the "rest" regs.
+ * Most system calls don't save these registers, don't report them.
*/
regs_user_copy->bx = -1;
regs_user_copy->bp = -1;
@@ -158,37 +162,13 @@ void perf_get_regs_user(struct perf_regs *regs_user,
/*
* For this to be at all useful, we need a reasonable guess for
- * sp and the ABI. Be careful: we're in NMI context, and we're
+ * the ABI. Be careful: we're in NMI context, and we're
* considering current to be the current task, so we should
* be careful not to look at any other percpu variables that might
* change during context switches.
*/
- if (IS_ENABLED(CONFIG_IA32_EMULATION) &&
- task_thread_info(current)->status & TS_COMPAT) {
- /* Easy case: we're in a compat syscall. */
- regs_user->abi = PERF_SAMPLE_REGS_ABI_32;
- regs_user_copy->sp = user_regs->sp;
- regs_user_copy->cs = user_regs->cs;
- regs_user_copy->ss = user_regs->ss;
- } else if (user_regs->orig_ax != -1) {
- /*
- * We're probably in a 64-bit syscall.
- * Warning: this code is severely racy. At least it's better
- * than just blindly copying user_regs.
- */
- regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
- regs_user_copy->sp = this_cpu_read(old_rsp);
- regs_user_copy->cs = __USER_CS;
- regs_user_copy->ss = __USER_DS;
- regs_user_copy->cx = -1; /* usually contains garbage */
- } else {
- /* We're probably in an interrupt or exception. */
- regs_user->abi = user_64bit_mode(user_regs) ?
- PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
- regs_user_copy->sp = user_regs->sp;
- regs_user_copy->cs = user_regs->cs;
- regs_user_copy->ss = user_regs->ss;
- }
+ regs_user->abi = user_64bit_mode(user_regs) ?
+ PERF_SAMPLE_REGS_ABI_64 : PERF_SAMPLE_REGS_ABI_32;
regs_user->regs = regs_user_copy;
}
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d6..8213da6 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
#include <linux/sched.h>
#include <linux/module.h>
#include <linux/pm.h>
-#include <linux/clockchips.h>
+#include <linux/tick.h>
#include <linux/random.h>
#include <linux/user-return-notifier.h>
#include <linux/dmi.h>
@@ -24,6 +24,7 @@
#include <asm/syscalls.h>
#include <asm/idle.h>
#include <asm/uaccess.h>
+#include <asm/mwait.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h>
#include <asm/debugreg.h>
@@ -37,7 +38,26 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+ .x86_tss = {
+ .sp0 = TOP_OF_INIT_STACK,
+#ifdef CONFIG_X86_32
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+#endif
+ },
+#ifdef CONFIG_X86_32
+ /*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
+#endif
+};
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -69,8 +89,8 @@ int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
dst->thread.fpu_counter = 0;
dst->thread.fpu.has_fpu = 0;
- dst->thread.fpu.last_cpu = ~0;
dst->thread.fpu.state = NULL;
+ task_disable_lazy_fpu_restore(dst);
if (tsk_used_math(src)) {
int err = fpu_alloc(&dst->thread.fpu);
if (err)
@@ -109,7 +129,7 @@ void exit_thread(void)
unsigned long *bp = t->io_bitmap_ptr;
if (bp) {
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+ struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
@@ -131,13 +151,18 @@ void flush_thread(void)
flush_ptrace_hw_breakpoint(tsk);
memset(tsk->thread.tls_array, 0, sizeof(tsk->thread.tls_array));
- drop_init_fpu(tsk);
- /*
- * Free the FPU state for non xsave platforms. They get reallocated
- * lazily at the first use.
- */
- if (!use_eager_fpu())
+
+ if (!use_eager_fpu()) {
+ /* FPU state will be reallocated lazily at the first use. */
+ drop_fpu(tsk);
free_thread_xstate(tsk);
+ } else if (!used_math()) {
+ /* kthread execs. TODO: cleanup this horror. */
+ if (WARN_ON(init_fpu(tsk)))
+ force_sig(SIGKILL, tsk);
+ user_fpu_begin();
+ restore_init_xstate();
+ }
}
static void hard_disable_TSC(void)
@@ -377,14 +402,11 @@ static void amd_e400_idle(void)
if (!cpumask_test_cpu(cpu, amd_e400_c1e_mask)) {
cpumask_set_cpu(cpu, amd_e400_c1e_mask);
- /*
- * Force broadcast so ACPI can not interfere.
- */
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_FORCE,
- &cpu);
+ /* Force broadcast so ACPI can not interfere. */
+ tick_broadcast_force();
pr_info("Switch to broadcast mode on CPU%d\n", cpu);
}
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_ENTER, &cpu);
+ tick_broadcast_enter();
default_idle();
@@ -393,12 +415,59 @@ static void amd_e400_idle(void)
* called with interrupts disabled.
*/
local_irq_disable();
- clockevents_notify(CLOCK_EVT_NOTIFY_BROADCAST_EXIT, &cpu);
+ tick_broadcast_exit();
local_irq_enable();
} else
default_idle();
}
+/*
+ * Intel Core2 and older machines prefer MWAIT over HALT for C1.
+ * We can't rely on cpuidle installing MWAIT, because it will not load
+ * on systems that support only C1 -- so the boot default must be MWAIT.
+ *
+ * Some AMD machines are the opposite, they depend on using HALT.
+ *
+ * So for default C1, which is used during boot until cpuidle loads,
+ * use MWAIT-C1 on Intel HW that has it, else use HALT.
+ */
+static int prefer_mwait_c1_over_halt(const struct cpuinfo_x86 *c)
+{
+ if (c->x86_vendor != X86_VENDOR_INTEL)
+ return 0;
+
+ if (!cpu_has(c, X86_FEATURE_MWAIT))
+ return 0;
+
+ return 1;
+}
+
+/*
+ * MONITOR/MWAIT with no hints, used for default default C1 state.
+ * This invokes MWAIT with interrutps enabled and no flags,
+ * which is backwards compatible with the original MWAIT implementation.
+ */
+
+static void mwait_idle(void)
+{
+ if (!current_set_polling_and_test()) {
+ if (this_cpu_has(X86_BUG_CLFLUSH_MONITOR)) {
+ smp_mb(); /* quirk */
+ clflush((void *)&current_thread_info()->flags);
+ smp_mb(); /* quirk */
+ }
+
+ __monitor((void *)&current_thread_info()->flags, 0, 0);
+ if (!need_resched())
+ __sti_mwait(0, 0);
+ else
+ local_irq_enable();
+ } else {
+ local_irq_enable();
+ }
+ __current_clr_polling();
+}
+
void select_idle_routine(const struct cpuinfo_x86 *c)
{
#ifdef CONFIG_SMP
@@ -412,6 +481,9 @@ void select_idle_routine(const struct cpuinfo_x86 *c)
/* E400: APIC timer interrupt does not wake up CPU from C1e */
pr_info("using AMD E400 aware idle routine\n");
x86_idle = amd_e400_idle;
+ } else if (prefer_mwait_c1_over_halt(c)) {
+ pr_info("using mwait in idle threads\n");
+ x86_idle = mwait_idle;
} else
x86_idle = default_idle;
}
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f9..8ed2106 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -73,7 +73,7 @@ void __show_regs(struct pt_regs *regs, int all)
unsigned long sp;
unsigned short ss, gs;
- if (user_mode_vm(regs)) {
+ if (user_mode(regs)) {
sp = regs->sp;
ss = regs->ss & 0xffff;
gs = get_user_gs(regs);
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->ip = new_ip;
regs->sp = new_sp;
regs->flags = X86_EFLAGS_IF;
- /*
- * force it to the iret return path by making it look as if there was
- * some work pending.
- */
- set_thread_flag(TIF_NOTIFY_RESUME);
+ force_iret();
}
EXPORT_SYMBOL_GPL(start_thread);
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
fpu_switch_t fpu;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
/*
- * Reload esp0.
- */
- load_sp0(tss, next);
-
- /*
* Save away %gs. No need to save %fs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_end_context_switch(next_p);
+ /*
+ * Reload esp0, kernel_stack, and current_top_of_stack. This changes
+ * current_thread_info().
+ */
+ load_sp0(tss, next);
this_cpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - KERNEL_STACK_OFFSET);
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
/*
* Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43..4baaa97 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
asmlinkage extern void ret_from_fork(void);
-__visible DEFINE_PER_CPU(unsigned long, old_rsp);
+__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
p->thread.sp = (unsigned long) childregs;
- p->thread.usersp = me->thread.usersp;
set_tsk_thread_flag(p, TIF_FORK);
p->thread.io_bitmap_ptr = NULL;
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
*/
if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
+ if (is_ia32_task())
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
else
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
- current->thread.usersp = new_sp;
regs->ip = new_ip;
regs->sp = new_sp;
- this_cpu_write(old_rsp, new_sp);
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
+ force_iret();
}
void
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned fsindex, gsindex;
fpu_switch_t fpu;
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
- /* Reload esp0 and ss1. */
- load_sp0(tss, next);
-
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
*
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = this_cpu_read(old_rsp);
- this_cpu_write(old_rsp, next->usersp);
this_cpu_write(current_task, next_p);
/*
@@ -413,9 +406,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+ /* Reload esp0 and ss1. This changes current_thread_info(). */
+ load_sp0(tss, next);
+
this_cpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - KERNEL_STACK_OFFSET);
+ (unsigned long)task_stack_page(next_p) + THREAD_SIZE);
/*
* Now maybe reload the debug registers and handle I/O bitmaps
@@ -602,6 +597,5 @@ long sys_arch_prctl(int code, unsigned long addr)
unsigned long KSTK_ESP(struct task_struct *task)
{
- return (test_tsk_thread_flag(task, TIF_IA32)) ?
- (task_pt_regs(task)->sp) : ((task)->thread.usersp);
+ return task_pt_regs(task)->sp;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618..a7bc794 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
case offsetof(struct user_regs_struct,cs):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->cs = value;
-#endif
+ task_pt_regs(task)->cs = value;
break;
case offsetof(struct user_regs_struct,ss):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->ss = value;
-#endif
+ task_pt_regs(task)->ss = value;
break;
}
@@ -1421,7 +1415,7 @@ static void fill_sigtrap_info(struct task_struct *tsk,
memset(info, 0, sizeof(*info));
info->si_signo = SIGTRAP;
info->si_code = si_code;
- info->si_addr = user_mode_vm(regs) ? (void __user *)regs->ip : NULL;
+ info->si_addr = user_mode(regs) ? (void __user *)regs->ip : NULL;
}
void user_single_step_siginfo(struct task_struct *tsk,
diff --git a/arch/x86/kernel/pvclock.c b/arch/x86/kernel/pvclock.c
index 2f355d2..e5ecd20 100644
--- a/arch/x86/kernel/pvclock.c
+++ b/arch/x86/kernel/pvclock.c
@@ -141,7 +141,46 @@ void pvclock_read_wallclock(struct pvclock_wall_clock *wall_clock,
set_normalized_timespec(ts, now.tv_sec, now.tv_nsec);
}
+static struct pvclock_vsyscall_time_info *pvclock_vdso_info;
+
+static struct pvclock_vsyscall_time_info *
+pvclock_get_vsyscall_user_time_info(int cpu)
+{
+ if (!pvclock_vdso_info) {
+ BUG();
+ return NULL;
+ }
+
+ return &pvclock_vdso_info[cpu];
+}
+
+struct pvclock_vcpu_time_info *pvclock_get_vsyscall_time_info(int cpu)
+{
+ return &pvclock_get_vsyscall_user_time_info(cpu)->pvti;
+}
+
#ifdef CONFIG_X86_64
+static int pvclock_task_migrate(struct notifier_block *nb, unsigned long l,
+ void *v)
+{
+ struct task_migration_notifier *mn = v;
+ struct pvclock_vsyscall_time_info *pvti;
+
+ pvti = pvclock_get_vsyscall_user_time_info(mn->from_cpu);
+
+ /* this is NULL when pvclock vsyscall is not initialized */
+ if (unlikely(pvti == NULL))
+ return NOTIFY_DONE;
+
+ pvti->migrate_count++;
+
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block pvclock_migrate = {
+ .notifier_call = pvclock_task_migrate,
+};
+
/*
* Initialize the generic pvclock vsyscall state. This will allocate
* a/some page(s) for the per-vcpu pvclock information, set up a
@@ -155,12 +194,17 @@ int __init pvclock_init_vsyscall(struct pvclock_vsyscall_time_info *i,
WARN_ON (size != PVCLOCK_VSYSCALL_NR_PAGES*PAGE_SIZE);
+ pvclock_vdso_info = i;
+
for (idx = 0; idx <= (PVCLOCK_FIXMAP_END-PVCLOCK_FIXMAP_BEGIN); idx++) {
__set_fixmap(PVCLOCK_FIXMAP_BEGIN + idx,
__pa(i) + (idx*PAGE_SIZE),
PAGE_KERNEL_VVAR);
}
+
+ register_task_migration_notifier(&pvclock_migrate);
+
return 0;
}
#endif
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index bae6c60..86db4bc 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -183,6 +183,16 @@ static struct dmi_system_id __initdata reboot_dmi_table[] = {
},
},
+ /* ASRock */
+ { /* Handle problems with rebooting on ASRock Q1900DC-ITX */
+ .callback = set_pci_reboot,
+ .ident = "ASRock Q1900DC-ITX",
+ .matches = {
+ DMI_MATCH(DMI_BOARD_VENDOR, "ASRock"),
+ DMI_MATCH(DMI_BOARD_NAME, "Q1900DC-ITX"),
+ },
+ },
+
/* ASUS */
{ /* Handle problems with rebooting on ASUS P4S800 */
.callback = set_bios_reboot,
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7..77630d5 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
movl (%ebx), %ecx
addl $4, %ebx
1:
- testl $0x1, %ecx /* is it a destination page */
+ testb $0x1, %cl /* is it a destination page */
jz 2f
movl %ecx, %edi
andl $0xfffff000, %edi
jmp 0b
2:
- testl $0x2, %ecx /* is it an indirection page */
+ testb $0x2, %cl /* is it an indirection page */
jz 2f
movl %ecx, %ebx
andl $0xfffff000, %ebx
jmp 0b
2:
- testl $0x4, %ecx /* is it the done indicator */
+ testb $0x4, %cl /* is it the done indicator */
jz 2f
jmp 3f
2:
- testl $0x8, %ecx /* is it the source indicator */
+ testb $0x8, %cl /* is it the source indicator */
jz 0b /* Ignore it otherwise */
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c69..98111b3 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -123,7 +123,7 @@ identity_mapped:
* Set cr4 to a known state:
* - physical address extension enabled
*/
- movq $X86_CR4_PAE, %rax
+ movl $X86_CR4_PAE, %eax
movq %rax, %cr4
jmp 1f
@@ -221,23 +221,23 @@ swap_pages:
movq (%rbx), %rcx
addq $8, %rbx
1:
- testq $0x1, %rcx /* is it a destination page? */
+ testb $0x1, %cl /* is it a destination page? */
jz 2f
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
jmp 0b
2:
- testq $0x2, %rcx /* is it an indirection page? */
+ testb $0x2, %cl /* is it an indirection page? */
jz 2f
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
jmp 0b
2:
- testq $0x4, %rcx /* is it the done indicator? */
+ testb $0x4, %cl /* is it the done indicator? */
jz 2f
jmp 3f
2:
- testq $0x8, %rcx /* is it the source indicator? */
+ testb $0x8, %cl /* is it the source indicator? */
jz 0b /* Ignore it otherwise */
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
@@ -246,17 +246,17 @@ swap_pages:
movq %rsi, %rax
movq %r10, %rdi
- movq $512, %rcx
+ movl $512, %ecx
rep ; movsq
movq %rax, %rdi
movq %rdx, %rsi
- movq $512, %rcx
+ movl $512, %ecx
rep ; movsq
movq %rdx, %rdi
movq %r10, %rsi
- movq $512, %rcx
+ movl $512, %ecx
rep ; movsq
lea PAGE_SIZE(%rax), %rsi
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0a2421c..d74ac33 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -354,7 +354,7 @@ static void __init relocate_initrd(void)
mapaddr = ramdisk_image & PAGE_MASK;
p = early_memremap(mapaddr, clen+slop);
memcpy(q, p+slop, clen);
- early_iounmap(p, clen+slop);
+ early_memunmap(p, clen+slop);
q += clen;
ramdisk_image += clen;
ramdisk_size -= clen;
@@ -438,7 +438,7 @@ static void __init parse_setup_data(void)
data_len = data->len + sizeof(struct setup_data);
data_type = data->type;
pa_next = data->next;
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
switch (data_type) {
case SETUP_E820_EXT:
@@ -470,7 +470,7 @@ static void __init e820_reserve_setup_data(void)
E820_RAM, E820_RESERVED_KERN);
found = 1;
pa_data = data->next;
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
}
if (!found)
return;
@@ -491,7 +491,7 @@ static void __init memblock_x86_reserve_range_setup_data(void)
data = early_memremap(pa_data, sizeof(*data));
memblock_reserve(pa_data, sizeof(*data) + data->len);
pa_data = data->next;
- early_iounmap(data, sizeof(*data));
+ early_memunmap(data, sizeof(*data));
}
}
@@ -832,10 +832,15 @@ static void __init trim_low_memory_range(void)
static int
dump_kernel_offset(struct notifier_block *self, unsigned long v, void *p)
{
- pr_emerg("Kernel Offset: 0x%lx from 0x%lx "
- "(relocation range: 0x%lx-0x%lx)\n",
- (unsigned long)&_text - __START_KERNEL, __START_KERNEL,
- __START_KERNEL_map, MODULES_VADDR-1);
+ if (kaslr_enabled()) {
+ pr_emerg("Kernel Offset: 0x%lx from 0x%lx (relocation range: 0x%lx-0x%lx)\n",
+ (unsigned long)&_text - __START_KERNEL,
+ __START_KERNEL,
+ __START_KERNEL_map,
+ MODULES_VADDR-1);
+ } else {
+ pr_emerg("Kernel Offset: disabled\n");
+ }
return 0;
}
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e504246..3e58186 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -61,8 +61,7 @@
regs->seg = GET_SEG(seg) | 3; \
} while (0)
-int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
- unsigned long *pax)
+int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc)
{
void __user *buf;
unsigned int tmpflags;
@@ -81,7 +80,7 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
#endif /* CONFIG_X86_32 */
COPY(di); COPY(si); COPY(bp); COPY(sp); COPY(bx);
- COPY(dx); COPY(cx); COPY(ip);
+ COPY(dx); COPY(cx); COPY(ip); COPY(ax);
#ifdef CONFIG_X86_64
COPY(r8);
@@ -94,27 +93,20 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
COPY(r15);
#endif /* CONFIG_X86_64 */
-#ifdef CONFIG_X86_32
COPY_SEG_CPL3(cs);
COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
- /* Kernel saves and restores only the CS segment register on signals,
- * which is the bare minimum needed to allow mixed 32/64-bit code.
- * App's signal handler can save/restore other segments if needed. */
- COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
get_user_ex(tmpflags, &sc->flags);
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
regs->orig_ax = -1; /* disable syscall checks */
get_user_ex(buf, &sc->fpstate);
-
- get_user_ex(*pax, &sc->ax);
} get_user_catch(err);
err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
+ force_iret();
+
return err;
}
@@ -162,8 +154,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
#else /* !CONFIG_X86_32 */
put_user_ex(regs->flags, &sc->flags);
put_user_ex(regs->cs, &sc->cs);
- put_user_ex(0, &sc->gs);
- put_user_ex(0, &sc->fs);
+ put_user_ex(0, &sc->__pad2);
+ put_user_ex(0, &sc->__pad1);
+ put_user_ex(regs->ss, &sc->ss);
#endif /* CONFIG_X86_32 */
put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +450,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
regs->sp = (unsigned long)frame;
- /* Set up the CS register to run signal handlers in 64-bit mode,
- even if the handler happens to be interrupting 32-bit code. */
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value.
+ */
regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
return 0;
}
@@ -539,7 +542,6 @@ asmlinkage unsigned long sys_sigreturn(void)
{
struct pt_regs *regs = current_pt_regs();
struct sigframe __user *frame;
- unsigned long ax;
sigset_t set;
frame = (struct sigframe __user *)(regs->sp - 8);
@@ -553,9 +555,9 @@ asmlinkage unsigned long sys_sigreturn(void)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->sc, &ax))
+ if (restore_sigcontext(regs, &frame->sc))
goto badframe;
- return ax;
+ return regs->ax;
badframe:
signal_fault(regs, frame, "sigreturn");
@@ -568,7 +570,6 @@ asmlinkage long sys_rt_sigreturn(void)
{
struct pt_regs *regs = current_pt_regs();
struct rt_sigframe __user *frame;
- unsigned long ax;
sigset_t set;
frame = (struct rt_sigframe __user *)(regs->sp - sizeof(long));
@@ -579,13 +580,13 @@ asmlinkage long sys_rt_sigreturn(void)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
if (restore_altstack(&frame->uc.uc_stack))
goto badframe;
- return ax;
+ return regs->ax;
badframe:
signal_fault(regs, frame, "rt_sigreturn");
@@ -679,7 +680,7 @@ handle_signal(struct ksignal *ksig, struct pt_regs *regs)
* Ensure the signal handler starts with the new fpu state.
*/
if (used_math())
- drop_init_fpu(current);
+ fpu_reset_state(current);
}
signal_setup_done(failed, ksig, test_thread_flag(TIF_SINGLESTEP));
}
@@ -780,7 +781,6 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
struct pt_regs *regs = current_pt_regs();
struct rt_sigframe_x32 __user *frame;
sigset_t set;
- unsigned long ax;
frame = (struct rt_sigframe_x32 __user *)(regs->sp - 8);
@@ -791,13 +791,13 @@ asmlinkage long sys32_x32_rt_sigreturn(void)
set_current_blocked(&set);
- if (restore_sigcontext(regs, &frame->uc.uc_mcontext, &ax))
+ if (restore_sigcontext(regs, &frame->uc.uc_mcontext))
goto badframe;
if (compat_restore_altstack(&frame->uc.uc_stack))
goto badframe;
- return ax;
+ return regs->ax;
badframe:
signal_fault(regs, frame, "x32 rt_sigreturn");
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aa..50e547e 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
#include <asm/realmode.h>
#include <asm/misc.h>
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
/* Number of siblings per CPU package */
int smp_num_siblings = 1;
EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
lock_vector_lock();
set_cpu_online(smp_processor_id(), true);
unlock_vector_lock();
- per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+ cpu_set_state_online(smp_processor_id());
x86_platform.nmi_init();
/* enable local interrupts */
@@ -779,6 +776,26 @@ out:
return boot_error;
}
+void common_cpu_up(unsigned int cpu, struct task_struct *idle)
+{
+ /* Just in case we booted with a single CPU. */
+ alternatives_enable_smp();
+
+ per_cpu(current_task, cpu) = idle;
+
+#ifdef CONFIG_X86_32
+ /* Stack for startup_32 can be just as for start_secondary onwards */
+ irq_ctx_init(cpu);
+ per_cpu(cpu_current_top_of_stack, cpu) =
+ (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+#else
+ clear_tsk_thread_flag(idle, TIF_FORK);
+ initial_gs = per_cpu_offset(cpu);
+#endif
+ per_cpu(kernel_stack, cpu) =
+ (unsigned long)task_stack_page(idle) + THREAD_SIZE;
+}
+
/*
* NOTE - on most systems this is a PHYSICAL apic ID, but on multiquad
* (ie clustered apic addressing mode), this is a LOGICAL apic ID.
@@ -796,23 +813,9 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
int cpu0_nmi_registered = 0;
unsigned long timeout;
- /* Just in case we booted with a single CPU. */
- alternatives_enable_smp();
-
idle->thread.sp = (unsigned long) (((struct pt_regs *)
(THREAD_SIZE + task_stack_page(idle))) - 1);
- per_cpu(current_task, cpu) = idle;
-#ifdef CONFIG_X86_32
- /* Stack for startup_32 can be just as for start_secondary onwards */
- irq_ctx_init(cpu);
-#else
- clear_tsk_thread_flag(idle, TIF_FORK);
- initial_gs = per_cpu_offset(cpu);
-#endif
- per_cpu(kernel_stack, cpu) =
- (unsigned long)task_stack_page(idle) -
- KERNEL_STACK_OFFSET + THREAD_SIZE;
early_gdt_descr.address = (unsigned long)get_cpu_gdt_table(cpu);
initial_code = (unsigned long)start_secondary;
stack_start = idle->thread.sp;
@@ -948,11 +951,16 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
*/
mtrr_save_state();
- per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+ /* x86 CPUs take themselves offline, so delayed offline is OK. */
+ err = cpu_check_up_prepare(cpu);
+ if (err && err != -EBUSY)
+ return err;
/* the FPU context is blank, nobody can own it */
__cpu_disable_lazy_restore(cpu);
+ common_cpu_up(cpu, tidle);
+
err = do_boot_cpu(apicid, cpu, tidle);
if (err) {
pr_err("do_boot_cpu failed(%d) to wakeup CPU#%u\n", err, cpu);
@@ -1086,8 +1094,6 @@ static int __init smp_sanity_check(unsigned max_cpus)
return SMP_NO_APIC;
}
- verify_local_APIC();
-
/*
* If SMP should be disabled, then really disable it!
*/
@@ -1191,7 +1197,7 @@ void __init native_smp_prepare_boot_cpu(void)
switch_to_new_gdt(me);
/* already set me in cpu_online_mask in boot_cpu_init() */
cpumask_set_cpu(me, cpu_callout_mask);
- per_cpu(cpu_state, me) = CPU_ONLINE;
+ cpu_set_state_online(me);
}
void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1324,10 @@ static void __ref remove_cpu_from_maps(int cpu)
numa_remove_cpu(cpu);
}
-static DEFINE_PER_CPU(struct completion, die_complete);
-
void cpu_disable_common(void)
{
int cpu = smp_processor_id();
- init_completion(&per_cpu(die_complete, smp_processor_id()));
-
remove_siblinginfo(cpu);
/* It's now safe to remove this processor from the online map */
@@ -1349,24 +1351,27 @@ int native_cpu_disable(void)
return 0;
}
-void cpu_die_common(unsigned int cpu)
+int common_cpu_die(unsigned int cpu)
{
- wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
+ int ret = 0;
-void native_cpu_die(unsigned int cpu)
-{
/* We don't do anything here: idle task is faking death itself. */
- cpu_die_common(cpu);
-
/* They ack this in play_dead() by setting CPU_DEAD */
- if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+ if (cpu_wait_death(cpu, 5)) {
if (system_state == SYSTEM_RUNNING)
pr_info("CPU %u is now offline\n", cpu);
} else {
pr_err("CPU %u didn't die...\n", cpu);
+ ret = -1;
}
+
+ return ret;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+ common_cpu_die(cpu);
}
void play_dead_common(void)
@@ -1375,10 +1380,8 @@ void play_dead_common(void)
reset_lazy_tlbstate();
amd_e400_remove_cpu(raw_smp_processor_id());
- mb();
/* Ack it */
- __this_cpu_write(cpu_state, CPU_DEAD);
- complete(&per_cpu(die_complete, smp_processor_id()));
+ (void)cpu_report_death();
/*
* With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c
index 30277e2..10e0272 100644
--- a/arch/x86/kernel/sys_x86_64.c
+++ b/arch/x86/kernel/sys_x86_64.c
@@ -34,10 +34,26 @@ static unsigned long get_align_mask(void)
return va_align.mask;
}
+/*
+ * To avoid aliasing in the I$ on AMD F15h, the bits defined by the
+ * va_align.bits, [12:upper_bit), are set to a random value instead of
+ * zeroing them. This random value is computed once per boot. This form
+ * of ASLR is known as "per-boot ASLR".
+ *
+ * To achieve this, the random value is added to the info.align_offset
+ * value before calling vm_unmapped_area() or ORed directly to the
+ * address.
+ */
+static unsigned long get_align_bits(void)
+{
+ return va_align.bits & get_align_mask();
+}
+
unsigned long align_vdso_addr(unsigned long addr)
{
unsigned long align_mask = get_align_mask();
- return (addr + align_mask) & ~align_mask;
+ addr = (addr + align_mask) & ~align_mask;
+ return addr | get_align_bits();
}
static int __init control_va_addr_alignment(char *str)
@@ -135,8 +151,12 @@ arch_get_unmapped_area(struct file *filp, unsigned long addr,
info.length = len;
info.low_limit = begin;
info.high_limit = end;
- info.align_mask = filp ? get_align_mask() : 0;
+ info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
+ if (filp) {
+ info.align_mask = get_align_mask();
+ info.align_offset += get_align_bits();
+ }
return vm_unmapped_area(&info);
}
@@ -174,8 +194,12 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0,
info.length = len;
info.low_limit = PAGE_SIZE;
info.high_limit = mm->mmap_base;
- info.align_mask = filp ? get_align_mask() : 0;
+ info.align_mask = 0;
info.align_offset = pgoff << PAGE_SHIFT;
+ if (filp) {
+ info.align_mask = get_align_mask();
+ info.align_offset += get_align_bits();
+ }
addr = vm_unmapped_area(&info);
if (!(addr & ~PAGE_MASK))
return addr;
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57..3777189 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
#include <linux/cache.h>
#include <asm/asm-offsets.h>
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#ifdef CONFIG_IA32_EMULATION
+#define SYM(sym, compat) compat
+#else
+#define SYM(sym, compat) sym
+#define ia32_sys_call_table sys_call_table
+#define __NR_ia32_syscall_max __NR_syscall_max
+#endif
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
typedef asmlinkage void (*sys_call_ptr_t)(void);
extern asmlinkage void sys_ni_syscall(void);
-__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
+ [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/kernel/test_rodata.c b/arch/x86/kernel/test_rodata.c
index b79133a..5ecbfe5 100644
--- a/arch/x86/kernel/test_rodata.c
+++ b/arch/x86/kernel/test_rodata.c
@@ -57,7 +57,7 @@ int rodata_test(void)
/* test 3: check the value hasn't changed */
/* If this test fails, we managed to overwrite the data */
if (!rodata_test_data) {
- printk(KERN_ERR "rodata_test: Test 3 failes (end data)\n");
+ printk(KERN_ERR "rodata_test: Test 3 fails (end data)\n");
return -ENODEV;
}
/* test 4: check if the rodata section is 4Kb aligned */
diff --git a/arch/x86/kernel/time.c b/arch/x86/kernel/time.c
index 25adc0e..d39c091 100644
--- a/arch/x86/kernel/time.c
+++ b/arch/x86/kernel/time.c
@@ -30,7 +30,7 @@ unsigned long profile_pc(struct pt_regs *regs)
{
unsigned long pc = instruction_pointer(regs);
- if (!user_mode_vm(regs) && in_lock_functions(pc)) {
+ if (!user_mode(regs) && in_lock_functions(pc)) {
#ifdef CONFIG_FRAME_POINTER
return *(unsigned long *)(regs->bp + sizeof(long));
#else
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4ff5d16..324ab52 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -112,7 +112,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
{
enum ctx_state prev_state;
- if (user_mode_vm(regs)) {
+ if (user_mode(regs)) {
/* Other than that, we're just an exception. */
prev_state = exception_enter();
} else {
@@ -123,7 +123,7 @@ enum ctx_state ist_enter(struct pt_regs *regs)
* but we need to notify RCU.
*/
rcu_nmi_enter();
- prev_state = IN_KERNEL; /* the value is irrelevant. */
+ prev_state = CONTEXT_KERNEL; /* the value is irrelevant. */
}
/*
@@ -146,7 +146,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
/* Must be before exception_exit. */
preempt_count_sub(HARDIRQ_OFFSET);
- if (user_mode_vm(regs))
+ if (user_mode(regs))
return exception_exit(prev_state);
else
rcu_nmi_exit();
@@ -158,7 +158,7 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
*
* IST exception handlers normally cannot schedule. As a special
* exception, if the exception interrupted userspace code (i.e.
- * user_mode_vm(regs) would return true) and the exception was not
+ * user_mode(regs) would return true) and the exception was not
* a double fault, it can be safe to schedule. ist_begin_non_atomic()
* begins a non-atomic section within an ist_enter()/ist_exit() region.
* Callers are responsible for enabling interrupts themselves inside
@@ -167,15 +167,15 @@ void ist_exit(struct pt_regs *regs, enum ctx_state prev_state)
*/
void ist_begin_non_atomic(struct pt_regs *regs)
{
- BUG_ON(!user_mode_vm(regs));
+ BUG_ON(!user_mode(regs));
/*
* Sanity check: we need to be on the normal thread stack. This
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
- BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
- & ~(THREAD_SIZE - 1)) != 0);
+ BUG_ON((unsigned long)(current_top_of_stack() -
+ current_stack_pointer()) >= THREAD_SIZE);
preempt_count_sub(HARDIRQ_OFFSET);
}
@@ -194,8 +194,7 @@ static nokprobe_inline int
do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
struct pt_regs *regs, long error_code)
{
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK) {
+ if (v8086_mode(regs)) {
/*
* Traps 0, 1, 3, 4, and 5 should be forwarded to vm86.
* On nmi (interrupt 2), do_trap should not be called.
@@ -207,7 +206,7 @@ do_trap_no_signal(struct task_struct *tsk, int trapnr, char *str,
}
return -1;
}
-#endif
+
if (!user_mode(regs)) {
if (!fixup_exception(regs)) {
tsk->thread.error_code = error_code;
@@ -384,7 +383,7 @@ dotraplinkage void do_bounds(struct pt_regs *regs, long error_code)
goto exit;
conditional_sti(regs);
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
die("bounds", regs, error_code);
if (!cpu_feature_enabled(X86_FEATURE_MPX)) {
@@ -462,13 +461,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
prev_state = exception_enter();
conditional_sti(regs);
-#ifdef CONFIG_X86_32
- if (regs->flags & X86_VM_MASK) {
+ if (v8086_mode(regs)) {
local_irq_enable();
handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
goto exit;
}
-#endif
tsk = current;
if (!user_mode(regs)) {
@@ -587,7 +584,7 @@ struct bad_iret_stack *fixup_bad_iret(struct bad_iret_stack *s)
/* Copy the remainder of the stack from the current stack. */
memmove(new_stack, s, offsetof(struct bad_iret_stack, regs.ip));
- BUG_ON(!user_mode_vm(&new_stack->regs));
+ BUG_ON(!user_mode(&new_stack->regs));
return new_stack;
}
NOKPROBE_SYMBOL(fixup_bad_iret);
@@ -637,7 +634,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
* then it's very likely the result of an icebp/int01 trap.
* User wants a sigtrap for that.
*/
- if (!dr6 && user_mode_vm(regs))
+ if (!dr6 && user_mode(regs))
user_icebp = 1;
/* Catch kmemcheck conditions first of all! */
@@ -673,7 +670,7 @@ dotraplinkage void do_debug(struct pt_regs *regs, long error_code)
/* It's safe to allow irq's after DR6 has been saved */
preempt_conditional_sti(regs);
- if (regs->flags & X86_VM_MASK) {
+ if (v8086_mode(regs)) {
handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code,
X86_TRAP_DB);
preempt_conditional_cli(regs);
@@ -721,7 +718,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
return;
conditional_sti(regs);
- if (!user_mode_vm(regs))
+ if (!user_mode(regs))
{
if (!fixup_exception(regs)) {
task->thread.error_code = error_code;
@@ -734,7 +731,7 @@ static void math_error(struct pt_regs *regs, int error_code, int trapnr)
/*
* Save the info for the exception handler and clear the error.
*/
- save_init_fpu(task);
+ unlazy_fpu(task);
task->thread.trap_nr = trapnr;
task->thread.error_code = error_code;
info.si_signo = SIGFPE;
@@ -863,7 +860,7 @@ void math_state_restore(void)
kernel_fpu_disable();
__thread_fpu_begin(tsk);
if (unlikely(restore_fpu_checking(tsk))) {
- drop_init_fpu(tsk);
+ fpu_reset_state(tsk);
force_sig_info(SIGSEGV, SEND_SIG_PRIV, tsk);
} else {
tsk->thread.fpu_counter++;
@@ -925,9 +922,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
/* Set of traps needed for early debugging. */
void __init early_trap_init(void)
{
- set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+ /*
+ * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
+ * is ready in cpu_init() <-- trap_init(). Before trap_init(),
+ * CPU runs at ring 0 so it is impossible to hit an invalid
+ * stack. Using the original stack works well enough at this
+ * early stage. DEBUG_STACK will be equipped after cpu_init() in
+ * trap_init().
+ *
+ * We don't need to set trace_idt_table like set_intr_gate(),
+ * since we don't have trace_debug and it will be reset to
+ * 'debug' in trap_init() by set_intr_gate_ist().
+ */
+ set_intr_gate_notrace(X86_TRAP_DB, debug);
/* int3 can be called from all */
- set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+ set_system_intr_gate(X86_TRAP_BP, &int3);
#ifdef CONFIG_X86_32
set_intr_gate(X86_TRAP_PF, page_fault);
#endif
@@ -1005,6 +1014,15 @@ void __init trap_init(void)
*/
cpu_init();
+ /*
+ * X86_TRAP_DB and X86_TRAP_BP have been set
+ * in early_trap_init(). However, ITS works only after
+ * cpu_init() loads TSS. See comments in early_trap_init().
+ */
+ set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+ /* int3 can be called from all */
+ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+
x86_init.irqs.trap_init();
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 81f8adb0..0b81ad6 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -912,7 +912,7 @@ int arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val,
int ret = NOTIFY_DONE;
/* We are only interested in userspace traps */
- if (regs && !user_mode_vm(regs))
+ if (regs && !user_mode(regs))
return NOTIFY_DONE;
switch (val) {
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf5..fc9db6e 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
do_exit(SIGSEGV);
}
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
current->thread.sp0 = current->thread.saved_sp0;
current->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
tsk->thread.saved_fs = info->regs32->fs;
tsk->thread.saved_gs = get_user_gs(info->regs32);
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
if (cpu_has_sep)
tsk->thread.sysenter_cs = 0;
diff --git a/arch/x86/kernel/xsave.c b/arch/x86/kernel/xsave.c
index cdc6cf9..87a815b 100644
--- a/arch/x86/kernel/xsave.c
+++ b/arch/x86/kernel/xsave.c
@@ -342,7 +342,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
config_enabled(CONFIG_IA32_EMULATION));
if (!buf) {
- drop_init_fpu(tsk);
+ fpu_reset_state(tsk);
return 0;
}
@@ -416,7 +416,7 @@ int __restore_xstate_sig(void __user *buf, void __user *buf_fx, int size)
*/
user_fpu_begin();
if (restore_user_xstate(buf_fx, xstate_bv, fx_only)) {
- drop_init_fpu(tsk);
+ fpu_reset_state(tsk);
return -1;
}
}
@@ -678,19 +678,13 @@ void xsave_init(void)
this_func();
}
-static inline void __init eager_fpu_init_bp(void)
-{
- current->thread.fpu.state =
- alloc_bootmem_align(xstate_size, __alignof__(struct xsave_struct));
- if (!init_xstate_buf)
- setup_init_fpu_buf();
-}
-
-void eager_fpu_init(void)
+/*
+ * setup_init_fpu_buf() is __init and it is OK to call it here because
+ * init_xstate_buf will be unset only once during boot.
+ */
+void __init_refok eager_fpu_init(void)
{
- static __refdata void (*boot_func)(void) = eager_fpu_init_bp;
-
- clear_used_math();
+ WARN_ON(used_math());
current_thread_info()->status = 0;
if (eagerfpu == ENABLE)
@@ -701,21 +695,8 @@ void eager_fpu_init(void)
return;
}
- if (boot_func) {
- boot_func();
- boot_func = NULL;
- }
-
- /*
- * This is same as math_state_restore(). But use_xsave() is
- * not yet patched to use math_state_restore().
- */
- init_fpu(current);
- __thread_fpu_begin(current);
- if (cpu_has_xsave)
- xrstor_state(init_xstate_buf, -1);
- else
- fxrstor_checking(&init_xstate_buf->i387);
+ if (!init_xstate_buf)
+ setup_init_fpu_buf();
}
/*
OpenPOWER on IntegriCloud