From 682367c494869008eb89ef733f196e99415ae862 Mon Sep 17 00:00:00 2001 From: Nadav Amit Date: Wed, 18 Jun 2014 17:21:19 +0300 Subject: KVM: x86: Increase the number of fixed MTRR regs to 10 Recent Intel CPUs have 10 variable range MTRRs. Since operating systems sometime make assumptions on CPUs while they ignore capability MSRs, it is better for KVM to be consistent with recent CPUs. Reporting more MTRRs than actually supported has no functional implications. Signed-off-by: Nadav Amit Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 4931415..0bab29d 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -95,7 +95,7 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) #define KVM_REFILL_PAGES 25 #define KVM_MAX_CPUID_ENTRIES 80 #define KVM_NR_FIXED_MTRR_REGION 88 -#define KVM_NR_VAR_MTRR 8 +#define KVM_NR_VAR_MTRR 10 #define ASYNC_PF_PER_VCPU 64 -- cgit v1.1 From e1fa108d24697b78348fd4e5a531029a50d0d36d Mon Sep 17 00:00:00 2001 From: Xiaoming Gao Date: Thu, 19 Jun 2014 19:14:57 +0800 Subject: kvm: fix wrong address when writing Hyper-V tsc page When kvm_write_guest writes the tsc_ref structure to the guest, or it will lead the low HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT bits of the TSC page address must be cleared, or the guest can see a non-zero sequence number. Otherwise Windows guests would not be able to get a correct clocksource (QueryPerformanceCounter will always return 0) which causes serious chaos. Signed-off-by: Xiaoming Gao Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/kvm/x86.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f32a025..f644933 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1898,7 +1898,7 @@ static int set_msr_hyperv_pw(struct kvm_vcpu *vcpu, u32 msr, u64 data) if (!(data & HV_X64_MSR_TSC_REFERENCE_ENABLE)) break; gfn = data >> HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT; - if (kvm_write_guest(kvm, data, + if (kvm_write_guest(kvm, gfn << HV_X64_MSR_TSC_REFERENCE_ADDRESS_SHIFT, &tsc_ref, sizeof(tsc_ref))) return 1; mark_page_dirty(kvm, gfn); -- cgit v1.1 From 7cb060a91c0efc5ff94f83c6df3ed705e143cdb9 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 19 Jun 2014 11:40:18 +0200 Subject: KVM: x86: preserve the high 32-bits of the PAT register KVM does not really do much with the PAT, so this went unnoticed for a long time. It is exposed however if you try to do rdmsr on the PAT register. 
Reported-by: Valentine Sinitsyn Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini --- arch/x86/include/asm/kvm_host.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 0bab29d..49205d0 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -461,7 +461,7 @@ struct kvm_vcpu_arch { bool nmi_injected; /* Trying to inject an NMI this entry */ struct mtrr_state_type mtrr_state; - u32 pat; + u64 pat; unsigned switch_db_regs; unsigned long db[KVM_NR_DB_REGS]; -- cgit v1.1 From 5f56e7167e6d438324fcba87018255d81e201383 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jun 2014 15:59:46 -0700 Subject: x86/vdso: Discard the __bug_table section It serves no purpose in user code. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/2a5bebff42defd8a5e81d96f7dc00f21143c80e8.1403129369.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso-layout.lds.S | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index 2ec72f6..c84166c 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -75,6 +75,7 @@ SECTIONS /DISCARD/ : { *(.discard) *(.discard.*) + *(__bug_table) } } -- cgit v1.1 From c1979c370273fd9f7326ffa27a63b9ddb0f495f4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jun 2014 15:59:47 -0700 Subject: x86/vdso2c: Use better macros for ELF bitness Rather than using a separate macro for each replacement, use generic macros. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/d953cd2e70ceee1400985d091188cdd65fba2f05.1403129369.git.luto@amacapital.net Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/vdso2c.c | 42 +++++++++++++----------------------------- arch/x86/vdso/vdso2c.h | 23 ++++++++++++----------- 2 files changed, 25 insertions(+), 40 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index 7a6bf50..7343899 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -83,37 +83,21 @@ extern void bad_put_le(void); #define NSYMS (sizeof(required_syms) / sizeof(required_syms[0])) -#define BITS 64 -#define GOFUNC go64 -#define Elf_Ehdr Elf64_Ehdr -#define Elf_Shdr Elf64_Shdr -#define Elf_Phdr Elf64_Phdr -#define Elf_Sym Elf64_Sym -#define Elf_Dyn Elf64_Dyn +#define BITSFUNC3(name, bits) name##bits +#define BITSFUNC2(name, bits) BITSFUNC3(name, bits) +#define BITSFUNC(name) BITSFUNC2(name, ELF_BITS) + +#define ELF_BITS_XFORM2(bits, x) Elf##bits##_##x +#define ELF_BITS_XFORM(bits, x) ELF_BITS_XFORM2(bits, x) +#define ELF(x) ELF_BITS_XFORM(ELF_BITS, x) + +#define ELF_BITS 64 #include "vdso2c.h" -#undef BITS -#undef GOFUNC -#undef Elf_Ehdr -#undef Elf_Shdr -#undef Elf_Phdr -#undef Elf_Sym -#undef Elf_Dyn - -#define BITS 32 -#define GOFUNC go32 -#define Elf_Ehdr Elf32_Ehdr -#define Elf_Shdr Elf32_Shdr -#define Elf_Phdr Elf32_Phdr -#define Elf_Sym Elf32_Sym -#define Elf_Dyn Elf32_Dyn +#undef ELF_BITS + +#define ELF_BITS 32 #include "vdso2c.h" -#undef BITS -#undef GOFUNC -#undef Elf_Ehdr -#undef Elf_Shdr -#undef Elf_Phdr -#undef Elf_Sym -#undef Elf_Dyn +#undef ELF_BITS static void go(void *addr, size_t len, FILE *outfile, const char *name) { diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index c6eefaf..8e185ce3 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -4,23 +4,24 @@ * are built for 32-bit userspace. */ -static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) +static void BITSFUNC(go)(void *addr, size_t len, + FILE *outfile, const char *name) { int found_load = 0; unsigned long load_size = -1; /* Work around bogus warning */ unsigned long data_size; - Elf_Ehdr *hdr = (Elf_Ehdr *)addr; + ELF(Ehdr) *hdr = (ELF(Ehdr) *)addr; int i; unsigned long j; - Elf_Shdr *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, + ELF(Shdr) *symtab_hdr = NULL, *strtab_hdr, *secstrings_hdr, *alt_sec = NULL; - Elf_Dyn *dyn = 0, *dyn_end = 0; + ELF(Dyn) *dyn = 0, *dyn_end = 0; const char *secstrings; uint64_t syms[NSYMS] = {}; uint64_t fake_sections_value = 0, fake_sections_size = 0; - Elf_Phdr *pt = (Elf_Phdr *)(addr + GET_LE(&hdr->e_phoff)); + ELF(Phdr) *pt = (ELF(Phdr) *)(addr + GET_LE(&hdr->e_phoff)); /* Walk the segment table. 
*/ for (i = 0; i < GET_LE(&hdr->e_phnum); i++) { @@ -61,7 +62,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) GET_LE(&hdr->e_shentsize)*GET_LE(&hdr->e_shstrndx); secstrings = addr + GET_LE(&secstrings_hdr->sh_offset); for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { - Elf_Shdr *sh = addr + GET_LE(&hdr->e_shoff) + + ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) + GET_LE(&hdr->e_shentsize) * i; if (GET_LE(&sh->sh_type) == SHT_SYMTAB) symtab_hdr = sh; @@ -82,7 +83,7 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) i < GET_LE(&symtab_hdr->sh_size) / GET_LE(&symtab_hdr->sh_entsize); i++) { int k; - Elf_Sym *sym = addr + GET_LE(&symtab_hdr->sh_offset) + + ELF(Sym) *sym = addr + GET_LE(&symtab_hdr->sh_offset) + GET_LE(&symtab_hdr->sh_entsize) * i; const char *name = addr + GET_LE(&strtab_hdr->sh_offset) + GET_LE(&sym->st_name); @@ -123,12 +124,12 @@ static void GOFUNC(void *addr, size_t len, FILE *outfile, const char *name) fail("end_mapping must be a multiple of 4096\n"); /* Remove sections or use fakes */ - if (fake_sections_size % sizeof(Elf_Shdr)) + if (fake_sections_size % sizeof(ELF(Shdr))) fail("vdso_fake_sections size is not a multiple of %ld\n", - (long)sizeof(Elf_Shdr)); + (long)sizeof(ELF(Shdr))); PUT_LE(&hdr->e_shoff, fake_sections_value); - PUT_LE(&hdr->e_shentsize, fake_sections_value ? sizeof(Elf_Shdr) : 0); - PUT_LE(&hdr->e_shnum, fake_sections_size / sizeof(Elf_Shdr)); + PUT_LE(&hdr->e_shentsize, fake_sections_value ? sizeof(ELF(Shdr)) : 0); + PUT_LE(&hdr->e_shnum, fake_sections_size / sizeof(ELF(Shdr))); PUT_LE(&hdr->e_shstrndx, SHN_UNDEF); if (!name) { -- cgit v1.1 From bfad381c0d1e19cae8461e105d8d4387dd2a14fe Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jun 2014 15:59:48 -0700 Subject: x86/vdso: Improve the fake section headers Fully stripping the vDSO has other unfortunate side effects: - binutils is unable to find ELF notes without a SHT_NOTE section. - Even elfutils has trouble: it can find ELF notes without a section table at all, but if a section table is present, it won't look for PT_NOTE. - gdb wants section names to match between stripped DSOs and their symbols; otherwise it will corrupt symbol addresses. We're also breaking the rules: section 0 is supposed to be SHT_NULL. Fix these problems by building a better fake section table. While we're at it, we might as well let buggy Go versions keep working well by giving the SHT_DYNSYM entry the correct size. This is a bit unfortunate: it adds quite a bit of size to the vdso image. If/when binutils improves and the improved versions become widespread, it would be worth considering dropping most of this. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/0e546a5eeaafdf1840e6ee654a55c1e727c26663.1403129369.git.luto@amacapital.net Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/Makefile | 4 +- arch/x86/vdso/vdso-fakesections.c | 44 ++++---- arch/x86/vdso/vdso-layout.lds.S | 40 +++++-- arch/x86/vdso/vdso.lds.S | 2 + arch/x86/vdso/vdso2c.c | 31 ++++-- arch/x86/vdso/vdso2c.h | 180 +++++++++++++++++++++++++++---- arch/x86/vdso/vdso32/vdso-fakesections.c | 1 + arch/x86/vdso/vdsox32.lds.S | 2 + 8 files changed, 237 insertions(+), 67 deletions(-) create mode 100644 arch/x86/vdso/vdso32/vdso-fakesections.c (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 3c0809a..2c1ca98 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -11,7 +11,6 @@ VDSO32-$(CONFIG_COMPAT) := y # files to link into the vdso vobjs-y := vdso-note.o vclock_gettime.o vgetcpu.o vdso-fakesections.o -vobjs-nox32 := vdso-fakesections.o # files to link into kernel obj-y += vma.o @@ -134,7 +133,7 @@ override obj-dirs = $(dir $(obj)) $(obj)/vdso32/ targets += vdso32/vdso32.lds targets += vdso32/note.o vdso32/vclock_gettime.o $(vdso32.so-y:%=vdso32/%.o) -targets += vdso32/vclock_gettime.o +targets += vdso32/vclock_gettime.o vdso32/vdso-fakesections.o $(obj)/vdso32.o: $(vdso32-images:%=$(obj)/%) @@ -155,6 +154,7 @@ $(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ $(obj)/vdso32/vdso32.lds \ $(obj)/vdso32/vclock_gettime.o \ + $(obj)/vdso32/vdso-fakesections.o \ $(obj)/vdso32/note.o \ $(obj)/vdso32/%.o $(call if_changed,vdso) diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c index cb8a8d7..56927a7 100644 --- a/arch/x86/vdso/vdso-fakesections.c +++ b/arch/x86/vdso/vdso-fakesections.c @@ -2,31 +2,23 @@ * Copyright 2014 Andy Lutomirski * Subject to the GNU Public License, v.2 * - * Hack to keep broken Go programs working. - * - * The Go runtime had a couple of bugs: it would read the section table to try - * to figure out how many dynamic symbols there were (it shouldn't have looked - * at the section table at all) and, if there were no SHT_SYNDYM section table - * entry, it would use an uninitialized value for the number of symbols. As a - * workaround, we supply a minimal section table. vdso2c will adjust the - * in-memory image so that "vdso_fake_sections" becomes the section table. - * - * The bug was introduced by: - * https://code.google.com/p/go/source/detail?r=56ea40aac72b (2012-08-31) - * and is being addressed in the Go runtime in this issue: - * https://code.google.com/p/go/issues/detail?id=8197 + * String table for loadable section headers. See vdso2c.h for why + * this exists. */ -#ifndef __x86_64__ -#error This hack is specific to the 64-bit vDSO -#endif - -#include - -extern const __visible struct elf64_shdr vdso_fake_sections[]; -const __visible struct elf64_shdr vdso_fake_sections[] = { - { - .sh_type = SHT_DYNSYM, - .sh_entsize = sizeof(Elf64_Sym), - } -}; +const char fake_shstrtab[] __attribute__((section(".fake_shstrtab"))) = + ".hash\0" + ".dynsym\0" + ".dynstr\0" + ".gnu.version\0" + ".gnu.version_d\0" + ".dynamic\0" + ".rodata\0" + ".fake_shstrtab\0" /* Yay, self-referential code. */ + ".note\0" + ".data\0" + ".altinstructions\0" + ".altinstr_replacement\0" + ".eh_frame_hdr\0" + ".eh_frame\0" + ".text"; diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index c84166c..e4cbc21 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -6,6 +6,16 @@ * This script controls its layout. 
*/ +#if defined(BUILD_VDSO64) +# define SHDR_SIZE 64 +#elif defined(BUILD_VDSO32) || defined(BUILD_VDSOX32) +# define SHDR_SIZE 40 +#else +# error unknown VDSO target +#endif + +#define NUM_FAKE_SHDRS 16 + SECTIONS { . = SIZEOF_HEADERS; @@ -25,15 +35,29 @@ SECTIONS .dynamic : { *(.dynamic) } :text :dynamic - .rodata : { *(.rodata*) } :text + .rodata : { + *(.rodata*) + + /* + * Ideally this would live in a C file, but that won't + * work cleanly for x32 until we start building the x32 + * C code using an x32 toolchain. + */ + VDSO_FAKE_SECTION_TABLE_START = .; + . = . + NUM_FAKE_SHDRS * SHDR_SIZE; + VDSO_FAKE_SECTION_TABLE_END = .; + } :text + + .fake_shstrtab : { *(.fake_shstrtab) } :text + .data : { - *(.data*) - *(.sdata*) - *(.got.plt) *(.got) - *(.gnu.linkonce.d.*) - *(.bss*) - *(.dynbss*) - *(.gnu.linkonce.b.*) + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) } .altinstructions : { *(.altinstructions) } diff --git a/arch/x86/vdso/vdso.lds.S b/arch/x86/vdso/vdso.lds.S index 75e3404..6807932 100644 --- a/arch/x86/vdso/vdso.lds.S +++ b/arch/x86/vdso/vdso.lds.S @@ -6,6 +6,8 @@ * the DSO. */ +#define BUILD_VDSO64 + #include "vdso-layout.lds.S" /* diff --git a/arch/x86/vdso/vdso2c.c b/arch/x86/vdso/vdso2c.c index 7343899..238dbe82 100644 --- a/arch/x86/vdso/vdso2c.c +++ b/arch/x86/vdso/vdso2c.c @@ -23,6 +23,8 @@ enum { sym_vvar_page, sym_hpet_page, sym_end_mapping, + sym_VDSO_FAKE_SECTION_TABLE_START, + sym_VDSO_FAKE_SECTION_TABLE_END, }; const int special_pages[] = { @@ -30,15 +32,26 @@ const int special_pages[] = { sym_hpet_page, }; -char const * const required_syms[] = { - [sym_vvar_page] = "vvar_page", - [sym_hpet_page] = "hpet_page", - [sym_end_mapping] = "end_mapping", - "VDSO32_NOTE_MASK", - "VDSO32_SYSENTER_RETURN", - "__kernel_vsyscall", - "__kernel_sigreturn", - "__kernel_rt_sigreturn", +struct vdso_sym { + const char *name; + bool export; +}; + +struct vdso_sym required_syms[] = { + [sym_vvar_page] = {"vvar_page", true}, + [sym_hpet_page] = {"hpet_page", true}, + [sym_end_mapping] = {"end_mapping", true}, + [sym_VDSO_FAKE_SECTION_TABLE_START] = { + "VDSO_FAKE_SECTION_TABLE_START", false + }, + [sym_VDSO_FAKE_SECTION_TABLE_END] = { + "VDSO_FAKE_SECTION_TABLE_END", false + }, + {"VDSO32_NOTE_MASK", true}, + {"VDSO32_SYSENTER_RETURN", true}, + {"__kernel_vsyscall", true}, + {"__kernel_sigreturn", true}, + {"__kernel_rt_sigreturn", true}, }; __attribute__((format(printf, 1, 2))) __attribute__((noreturn)) diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index 8e185ce3..f01ed4b 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -4,6 +4,116 @@ * are built for 32-bit userspace. */ +/* + * We're writing a section table for a few reasons: + * + * The Go runtime had a couple of bugs: it would read the section + * table to try to figure out how many dynamic symbols there were (it + * shouldn't have looked at the section table at all) and, if there + * were no SHT_SYNDYM section table entry, it would use an + * uninitialized value for the number of symbols. An empty DYNSYM + * table would work, but I see no reason not to write a valid one (and + * keep full performance for old Go programs). This hack is only + * needed on x86_64. 
+ * + * The bug was introduced on 2012-08-31 by: + * https://code.google.com/p/go/source/detail?r=56ea40aac72b + * and was fixed on 2014-06-13 by: + * https://code.google.com/p/go/source/detail?r=fc1cd5e12595 + * + * Binutils has issues debugging the vDSO: it reads the section table to + * find SHT_NOTE; it won't look at PT_NOTE for the in-memory vDSO, which + * would break build-id if we removed the section table. Binutils + * also requires that shstrndx != 0. See: + * https://sourceware.org/bugzilla/show_bug.cgi?id=17064 + * + * elfutils might not look for PT_NOTE if there is a section table at + * all. I don't know whether this matters for any practical purpose. + * + * For simplicity, rather than hacking up a partial section table, we + * just write a mostly complete one. We omit non-dynamic symbols, + * though, since they're rather large. + * + * Once binutils gets fixed, we might be able to drop this for all but + * the 64-bit vdso, since build-id only works in kernel RPMs, and + * systems that update to new enough kernel RPMs will likely update + * binutils in sync. build-id has never worked for home-built kernel + * RPMs without manual symlinking, and I suspect that no one ever does + * that. + */ +struct BITSFUNC(fake_sections) +{ + ELF(Shdr) *table; + unsigned long table_offset; + int count, max_count; + + int in_shstrndx; + unsigned long shstr_offset; + const char *shstrtab; + size_t shstrtab_len; + + int out_shstrndx; +}; + +static unsigned int BITSFUNC(find_shname)(struct BITSFUNC(fake_sections) *out, + const char *name) +{ + const char *outname = out->shstrtab; + while (outname - out->shstrtab < out->shstrtab_len) { + if (!strcmp(name, outname)) + return (outname - out->shstrtab) + out->shstr_offset; + outname += strlen(outname) + 1; + } + + if (*name) + printf("Warning: could not find output name \"%s\"\n", name); + return out->shstr_offset + out->shstrtab_len - 1; /* Use a null. */ +} + +static void BITSFUNC(init_sections)(struct BITSFUNC(fake_sections) *out) +{ + if (!out->in_shstrndx) + fail("didn't find the fake shstrndx\n"); + + memset(out->table, 0, out->max_count * sizeof(ELF(Shdr))); + + if (out->max_count < 1) + fail("we need at least two fake output sections\n"); + + PUT_LE(&out->table[0].sh_type, SHT_NULL); + PUT_LE(&out->table[0].sh_name, BITSFUNC(find_shname)(out, "")); + + out->count = 1; +} + +static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out, + int in_idx, const ELF(Shdr) *in, + const char *name) +{ + uint64_t flags = GET_LE(&in->sh_flags); + + bool copy = flags & SHF_ALLOC; + + if (!copy) + return; + + if (out->count >= out->max_count) + fail("too many copied sections (max = %d)\n", out->max_count); + + if (in_idx == out->in_shstrndx) + out->out_shstrndx = out->count; + + out->table[out->count] = *in; + PUT_LE(&out->table[out->count].sh_name, + BITSFUNC(find_shname)(out, name)); + + /* elfutils requires that a strtab have the correct type. 
*/ + if (!strcmp(name, ".fake_shstrtab")) + PUT_LE(&out->table[out->count].sh_type, SHT_STRTAB); + + out->count++; +} + static void BITSFUNC(go)(void *addr, size_t len, FILE *outfile, const char *name) { @@ -19,7 +129,7 @@ static void BITSFUNC(go)(void *addr, size_t len, const char *secstrings; uint64_t syms[NSYMS] = {}; - uint64_t fake_sections_value = 0, fake_sections_size = 0; + struct BITSFUNC(fake_sections) fake_sections = {}; ELF(Phdr) *pt = (ELF(Phdr) *)(addr + GET_LE(&hdr->e_phoff)); @@ -89,23 +199,57 @@ static void BITSFUNC(go)(void *addr, size_t len, GET_LE(&sym->st_name); for (k = 0; k < NSYMS; k++) { - if (!strcmp(name, required_syms[k])) { + if (!strcmp(name, required_syms[k].name)) { if (syms[k]) { fail("duplicate symbol %s\n", - required_syms[k]); + required_syms[k].name); } syms[k] = GET_LE(&sym->st_value); } } - if (!strcmp(name, "vdso_fake_sections")) { - if (fake_sections_value) - fail("duplicate vdso_fake_sections\n"); - fake_sections_value = GET_LE(&sym->st_value); - fake_sections_size = GET_LE(&sym->st_size); + if (!strcmp(name, "fake_shstrtab")) { + ELF(Shdr) *sh; + + fake_sections.in_shstrndx = GET_LE(&sym->st_shndx); + fake_sections.shstrtab = addr + GET_LE(&sym->st_value); + fake_sections.shstrtab_len = GET_LE(&sym->st_size); + sh = addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * + fake_sections.in_shstrndx; + fake_sections.shstr_offset = GET_LE(&sym->st_value) - + GET_LE(&sh->sh_addr); } } + /* Build the output section table. */ + if (!syms[sym_VDSO_FAKE_SECTION_TABLE_START] || + !syms[sym_VDSO_FAKE_SECTION_TABLE_END]) + fail("couldn't find fake section table\n"); + if ((syms[sym_VDSO_FAKE_SECTION_TABLE_END] - + syms[sym_VDSO_FAKE_SECTION_TABLE_START]) % sizeof(ELF(Shdr))) + fail("fake section table size isn't a multiple of sizeof(Shdr)\n"); + fake_sections.table = addr + syms[sym_VDSO_FAKE_SECTION_TABLE_START]; + fake_sections.table_offset = syms[sym_VDSO_FAKE_SECTION_TABLE_START]; + fake_sections.max_count = (syms[sym_VDSO_FAKE_SECTION_TABLE_END] - + syms[sym_VDSO_FAKE_SECTION_TABLE_START]) / + sizeof(ELF(Shdr)); + + BITSFUNC(init_sections)(&fake_sections); + for (i = 0; i < GET_LE(&hdr->e_shnum); i++) { + ELF(Shdr) *sh = addr + GET_LE(&hdr->e_shoff) + + GET_LE(&hdr->e_shentsize) * i; + BITSFUNC(copy_section)(&fake_sections, i, sh, + secstrings + GET_LE(&sh->sh_name)); + } + if (!fake_sections.out_shstrndx) + fail("didn't generate shstrndx?!?\n"); + + PUT_LE(&hdr->e_shoff, fake_sections.table_offset); + PUT_LE(&hdr->e_shentsize, sizeof(ELF(Shdr))); + PUT_LE(&hdr->e_shnum, fake_sections.count); + PUT_LE(&hdr->e_shstrndx, fake_sections.out_shstrndx); + /* Validate mapping addresses. 
*/ for (i = 0; i < sizeof(special_pages) / sizeof(special_pages[0]); i++) { if (!syms[i]) @@ -113,25 +257,17 @@ static void BITSFUNC(go)(void *addr, size_t len, if (syms[i] % 4096) fail("%s must be a multiple of 4096\n", - required_syms[i]); + required_syms[i].name); if (syms[i] < data_size) fail("%s must be after the text mapping\n", - required_syms[i]); + required_syms[i].name); if (syms[sym_end_mapping] < syms[i] + 4096) - fail("%s overruns end_mapping\n", required_syms[i]); + fail("%s overruns end_mapping\n", + required_syms[i].name); } if (syms[sym_end_mapping] % 4096) fail("end_mapping must be a multiple of 4096\n"); - /* Remove sections or use fakes */ - if (fake_sections_size % sizeof(ELF(Shdr))) - fail("vdso_fake_sections size is not a multiple of %ld\n", - (long)sizeof(ELF(Shdr))); - PUT_LE(&hdr->e_shoff, fake_sections_value); - PUT_LE(&hdr->e_shentsize, fake_sections_value ? sizeof(ELF(Shdr)) : 0); - PUT_LE(&hdr->e_shnum, fake_sections_size / sizeof(ELF(Shdr))); - PUT_LE(&hdr->e_shstrndx, SHN_UNDEF); - if (!name) { fwrite(addr, load_size, 1, outfile); return; @@ -169,9 +305,9 @@ static void BITSFUNC(go)(void *addr, size_t len, (unsigned long)GET_LE(&alt_sec->sh_size)); } for (i = 0; i < NSYMS; i++) { - if (syms[i]) + if (required_syms[i].export && syms[i]) fprintf(outfile, "\t.sym_%s = 0x%" PRIx64 ",\n", - required_syms[i], syms[i]); + required_syms[i].name, syms[i]); } fprintf(outfile, "};\n"); } diff --git a/arch/x86/vdso/vdso32/vdso-fakesections.c b/arch/x86/vdso/vdso32/vdso-fakesections.c new file mode 100644 index 0000000..541468e --- /dev/null +++ b/arch/x86/vdso/vdso32/vdso-fakesections.c @@ -0,0 +1 @@ +#include "../vdso-fakesections.c" diff --git a/arch/x86/vdso/vdsox32.lds.S b/arch/x86/vdso/vdsox32.lds.S index 46b991b..697c11e 100644 --- a/arch/x86/vdso/vdsox32.lds.S +++ b/arch/x86/vdso/vdsox32.lds.S @@ -6,6 +6,8 @@ * the DSO. */ +#define BUILD_VDSOX32 + #include "vdso-layout.lds.S" /* -- cgit v1.1 From 0e3727a8839c988a3c56170bc8da76d55a16acad Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Wed, 18 Jun 2014 15:59:49 -0700 Subject: x86/vdso: Remove some redundant in-memory section headers .data doesn't need to be separate from .rodata: they're both readonly. .altinstructions and .altinstr_replacement aren't needed by anything except vdso2c; strip them from the final image. While we're at it, rather than aligning the actual executable text, just shove some unused-at-runtime data in between real data and text. My vdso image is still above 4k, but I'm disinclined to try to trim it harder for 3.16. For future trimming, I suspect that these sections could be moved to later in the file and dropped from the in-memory image: .gnu.version and .gnu.version_d (this may lose versions in gdb) .eh_frame (should be harmless) .eh_frame_hdr (I'm not really sure) .hash (AFAIK nothing needs this section header) Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/2e96d0c49016ea6d026a614ae645e93edd325961.1403129369.git.luto@amacapital.net Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/vdso-fakesections.c | 3 --- arch/x86/vdso/vdso-layout.lds.S | 43 +++++++++++++++++++++------------------ arch/x86/vdso/vdso2c.h | 4 +++- 3 files changed, 26 insertions(+), 24 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso-fakesections.c b/arch/x86/vdso/vdso-fakesections.c index 56927a7..aa5fbfa 100644 --- a/arch/x86/vdso/vdso-fakesections.c +++ b/arch/x86/vdso/vdso-fakesections.c @@ -16,9 +16,6 @@ const char fake_shstrtab[] __attribute__((section(".fake_shstrtab"))) = ".rodata\0" ".fake_shstrtab\0" /* Yay, self-referential code. */ ".note\0" - ".data\0" - ".altinstructions\0" - ".altinstr_replacement\0" ".eh_frame_hdr\0" ".eh_frame\0" ".text"; diff --git a/arch/x86/vdso/vdso-layout.lds.S b/arch/x86/vdso/vdso-layout.lds.S index e4cbc21..9197544 100644 --- a/arch/x86/vdso/vdso-layout.lds.S +++ b/arch/x86/vdso/vdso-layout.lds.S @@ -14,7 +14,7 @@ # error unknown VDSO target #endif -#define NUM_FAKE_SHDRS 16 +#define NUM_FAKE_SHDRS 13 SECTIONS { @@ -28,15 +28,17 @@ SECTIONS .gnu.version_d : { *(.gnu.version_d) } .gnu.version_r : { *(.gnu.version_r) } - .note : { *(.note.*) } :text :note - - .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr - .eh_frame : { KEEP (*(.eh_frame)) } :text - .dynamic : { *(.dynamic) } :text :dynamic .rodata : { *(.rodata*) + *(.data*) + *(.sdata*) + *(.got.plt) *(.got) + *(.gnu.linkonce.d.*) + *(.bss*) + *(.dynbss*) + *(.gnu.linkonce.b.*) /* * Ideally this would live in a C file, but that won't @@ -50,28 +52,29 @@ SECTIONS .fake_shstrtab : { *(.fake_shstrtab) } :text - .data : { - *(.data*) - *(.sdata*) - *(.got.plt) *(.got) - *(.gnu.linkonce.d.*) - *(.bss*) - *(.dynbss*) - *(.gnu.linkonce.b.*) - } - .altinstructions : { *(.altinstructions) } - .altinstr_replacement : { *(.altinstr_replacement) } + .note : { *(.note.*) } :text :note + + .eh_frame_hdr : { *(.eh_frame_hdr) } :text :eh_frame_hdr + .eh_frame : { KEEP (*(.eh_frame)) } :text + /* - * Align the actual code well away from the non-instruction data. - * This is the best thing for the I-cache. + * Text is well-separated from actual data: there's plenty of + * stuff that isn't used at runtime in between. */ - . = ALIGN(0x100); .text : { *(.text*) } :text =0x90909090, /* + * At the end so that eu-elflint stays happy when vdso2c strips + * these. A better implementation would avoid allocating space + * for these. + */ + .altinstructions : { *(.altinstructions) } :text + .altinstr_replacement : { *(.altinstr_replacement) } :text + + /* * The remainder of the vDSO consists of special pages that are * shared between the kernel and userspace. It needs to be at the * end so that it doesn't overlap the mapping of the actual diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index f01ed4b..f42e2dd 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -92,7 +92,9 @@ static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out, { uint64_t flags = GET_LE(&in->sh_flags); - bool copy = flags & SHF_ALLOC; + bool copy = flags & SHF_ALLOC && + strcmp(name, ".altinstructions") && + strcmp(name, ".altinstr_replacement"); if (!copy) return; -- cgit v1.1 From dda1e95cee38b416b23f751cac65421d781e3c10 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Fri, 20 Jun 2014 12:20:44 -0700 Subject: x86/vdso: Create .build-id links for unstripped vdso files With this change, doing 'make vdso_install' and telling gdb: set debug-file-directory /lib/modules/KVER/vdso will enable vdso debugging with symbols. 
This is useful for testing, but kernel RPM builds will probably want to manually delete these symlinks or otherwise do something sensible when they strip the vdso/*.so files. If ld does not support --build-id, then the symlinks will not be created. Note that kernel packagers that use vdso_install may need to adjust their packaging scripts to accomdate this change. For example, Fedora's scripts create build-id symlinks themselves in a different location, so the spec should probably be updated to remove the symlinks created by make vdso_install. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/a424b189ce3ced85fe1e82d032a20e765e0fe0d3.1403291930.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/vdso/Makefile | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 2c1ca98..68a15c4 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -169,14 +169,24 @@ quiet_cmd_vdso = VDSO $@ sh $(srctree)/$(src)/checkundef.sh '$(NM)' '$@' VDSO_LDFLAGS = -fPIC -shared $(call cc-ldoption, -Wl$(comma)--hash-style=sysv) \ - -Wl,-Bsymbolic $(LTO_CFLAGS) + $(call cc-ldoption, -Wl$(comma)--build-id) -Wl,-Bsymbolic $(LTO_CFLAGS) GCOV_PROFILE := n # -# Install the unstripped copies of vdso*.so. +# Install the unstripped copies of vdso*.so. If our toolchain supports +# build-id, install .build-id links as well. # quiet_cmd_vdso_install = INSTALL $(@:install_%=%) - cmd_vdso_install = cp $< $(MODLIB)/vdso/$(@:install_%=%) +define cmd_vdso_install + cp $< "$(MODLIB)/vdso/$(@:install_%=%)"; \ + if readelf -n $< |grep -q 'Build ID'; then \ + buildid=`readelf -n $< |grep 'Build ID' |sed -e 's/^.*Build ID: \(.*\)$$/\1/'`; \ + first=`echo $$buildid | cut -b-2`; \ + last=`echo $$buildid | cut -b3-`; \ + mkdir -p "$(MODLIB)/vdso/.build-id/$$first"; \ + ln -sf "../../$(@:install_%=%)" "$(MODLIB)/vdso/.build-id/$$first/$$last.debug"; \ + fi +endef vdso_img_insttargets := $(vdso_img_sodbg:%.dbg=install_%) -- cgit v1.1 From 554086d85e71f30abe46fc014fea31929a7c6a8a Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 23 Jun 2014 14:22:15 -0700 Subject: x86_32, entry: Do syscall exit work on badsys (CVE-2014-4508) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bad syscall nr paths are their own incomprehensible route through the entry control flow. Rearrange them to work just like syscalls that return -ENOSYS. This fixes an OOPS in the audit code when fast-path auditing is enabled and sysenter gets a bad syscall nr (CVE-2014-4508). This has probably been broken since Linux 2.6.27: af0575bba0 i386 syscall audit fast-path Cc: stable@vger.kernel.org Cc: Roland McGrath Reported-by: Toralf Förster Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/e09c499eade6fc321266dd6b54da7beb28d6991c.1403558229.git.luto@amacapital.net Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/entry_32.S | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index f0da82b..dbaa23e 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -423,9 +423,10 @@ sysenter_past_esp: jnz sysenter_audit sysenter_do_call: cmpl $(NR_syscalls), %eax - jae syscall_badsys + jae sysenter_badsys call *sys_call_table(,%eax,4) movl %eax,PT_EAX(%esp) +sysenter_after_call: LOCKDEP_SYS_EXIT DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF @@ -675,7 +676,12 @@ END(syscall_fault) syscall_badsys: movl $-ENOSYS,PT_EAX(%esp) - jmp resume_userspace + jmp syscall_exit +END(syscall_badsys) + +sysenter_badsys: + movl $-ENOSYS,PT_EAX(%esp) + jmp sysenter_after_call END(syscall_badsys) CFI_ENDPROC -- cgit v1.1 From 6ba19a670c8b604e9802d30b511e6a4778118592 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Sat, 21 Jun 2014 07:31:55 -0700 Subject: x86_32, signal: Fix vdso rt_sigreturn This commit: commit 6f121e548f83674ab4920a4e60afb58d4f61b829 Author: Andy Lutomirski Date: Mon May 5 12:19:34 2014 -0700 x86, vdso: Reimplement vdso.so preparation in build-time C Contained this obvious typo: - restorer = VDSO32_SYMBOL(current->mm->context.vdso, rt_sigreturn); + restorer = current->mm->context.vdso + + selected_vdso32->sym___kernel_sigreturn; Note the missing 'rt_' in the new code. Fix it. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/1eb40ad923acde2e18357ef2832867432e70ac42.1403361010.git.luto@amacapital.net Signed-off-by: H. Peter Anvin --- arch/x86/kernel/signal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index a0da58d..2851d63 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -363,7 +363,7 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, /* Set up to return from userspace. */ restorer = current->mm->context.vdso + - selected_vdso32->sym___kernel_sigreturn; + selected_vdso32->sym___kernel_rt_sigreturn; if (ksig->ka.sa.sa_flags & SA_RESTORER) restorer = ksig->ka.sa.sa_restorer; put_user_ex(restorer, &frame->pretcode); -- cgit v1.1 From f3aca3d09525f87731ba6b892c9b010570bc54b4 Mon Sep 17 00:00:00 2001 From: Aaron Tomlin Date: Mon, 23 Jun 2014 13:22:05 -0700 Subject: nmi: provide the option to issue an NMI back trace to every cpu but current Sometimes it is preferred not to use the trigger_all_cpu_backtrace() routine when one wants to avoid capturing a back trace for current. For instance if one was previously captured recently. This patch provides a new routine namely trigger_allbutself_cpu_backtrace() which offers the flexibility to issue an NMI to every cpu but current and capture a back trace accordingly. Patch x86 and sparc to support new routine. [dzickus@redhat.com: add stub in #else clause] [dzickus@redhat.com: don't print message in single processor case, wrap with get/put_cpu based on Oleg's suggestion] [sfr@canb.auug.org.au: undo C99ism] Signed-off-by: Aaron Tomlin Signed-off-by: Don Zickus Acked-by: David S. 
Miller Cc: Mateusz Guzik Cc: Oleg Nesterov Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/irq.h | 2 +- arch/x86/kernel/apic/hw_nmi.c | 18 ++++++++++++++---- 2 files changed, 15 insertions(+), 5 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/irq.h b/arch/x86/include/asm/irq.h index cb6cfcd..a80cbb8 100644 --- a/arch/x86/include/asm/irq.h +++ b/arch/x86/include/asm/irq.h @@ -43,7 +43,7 @@ extern int vector_used_by_percpu_irq(unsigned int vector); extern void init_ISA_irqs(void); #ifdef CONFIG_X86_LOCAL_APIC -void arch_trigger_all_cpu_backtrace(void); +void arch_trigger_all_cpu_backtrace(bool); #define arch_trigger_all_cpu_backtrace arch_trigger_all_cpu_backtrace #endif diff --git a/arch/x86/kernel/apic/hw_nmi.c b/arch/x86/kernel/apic/hw_nmi.c index c3fcb5d..6a1e71b 100644 --- a/arch/x86/kernel/apic/hw_nmi.c +++ b/arch/x86/kernel/apic/hw_nmi.c @@ -33,31 +33,41 @@ static DECLARE_BITMAP(backtrace_mask, NR_CPUS) __read_mostly; /* "in progress" flag of arch_trigger_all_cpu_backtrace */ static unsigned long backtrace_flag; -void arch_trigger_all_cpu_backtrace(void) +void arch_trigger_all_cpu_backtrace(bool include_self) { int i; + int cpu = get_cpu(); - if (test_and_set_bit(0, &backtrace_flag)) + if (test_and_set_bit(0, &backtrace_flag)) { /* * If there is already a trigger_all_cpu_backtrace() in progress * (backtrace_flag == 1), don't output double cpu dump infos. */ + put_cpu(); return; + } cpumask_copy(to_cpumask(backtrace_mask), cpu_online_mask); + if (!include_self) + cpumask_clear_cpu(cpu, to_cpumask(backtrace_mask)); - printk(KERN_INFO "sending NMI to all CPUs:\n"); - apic->send_IPI_all(NMI_VECTOR); + if (!cpumask_empty(to_cpumask(backtrace_mask))) { + pr_info("sending NMI to %s CPUs:\n", + (include_self ? "all" : "other")); + apic->send_IPI_mask(to_cpumask(backtrace_mask), NMI_VECTOR); + } /* Wait for up to 10 seconds for all CPUs to do the backtrace */ for (i = 0; i < 10 * 1000; i++) { if (cpumask_empty(to_cpumask(backtrace_mask))) break; mdelay(1); + touch_softlockup_watchdog(); } clear_bit(0, &backtrace_flag); smp_mb__after_atomic(); + put_cpu(); } static int -- cgit v1.1 From 46b57a76930f01ebf31230ed35af5beeccb5ad95 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 24 Jun 2014 13:46:52 -0700 Subject: x86/vdso: Move DISABLE_BRANCH_PROFILING into the vdso makefile DISABLE_BRANCH_PROFILING turns off branch profiling (i.e. a redefinition of 'if'). Branch profiling depends on a bunch of kernel-internal symbols and generates extra output sections, none of which are useful or functional in the vDSO. It's currently turned off for vclock_gettime.c, but vgetcpu.c also triggers branch profiling, so just turn it off in the makefile. This fixes the build on some configurations: the vdso could contain undefined symbols, and the fake section table overflowed due to ftrace's added sections. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/bf1ec29e03b2bbc081f6dcaefa64db1c3a83fb21.1403642755.git.luto@amacapital.net Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/Makefile | 4 +++- arch/x86/vdso/vclock_gettime.c | 3 --- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/Makefile b/arch/x86/vdso/Makefile index 68a15c4..61b04fe 100644 --- a/arch/x86/vdso/Makefile +++ b/arch/x86/vdso/Makefile @@ -66,7 +66,8 @@ $(obj)/vdso-image-%.c: $(obj)/vdso%.so.dbg $(obj)/vdso2c FORCE # CFL := $(PROFILING) -mcmodel=small -fPIC -O2 -fasynchronous-unwind-tables -m64 \ $(filter -g%,$(KBUILD_CFLAGS)) $(call cc-option, -fno-stack-protector) \ - -fno-omit-frame-pointer -foptimize-sibling-calls + -fno-omit-frame-pointer -foptimize-sibling-calls \ + -DDISABLE_BRANCH_PROFILING $(vobjs): KBUILD_CFLAGS += $(CFL) @@ -149,6 +150,7 @@ KBUILD_CFLAGS_32 += -m32 -msoft-float -mregparm=0 -fpic KBUILD_CFLAGS_32 += $(call cc-option, -fno-stack-protector) KBUILD_CFLAGS_32 += $(call cc-option, -foptimize-sibling-calls) KBUILD_CFLAGS_32 += -fno-omit-frame-pointer +KBUILD_CFLAGS_32 += -DDISABLE_BRANCH_PROFILING $(vdso32-images:%=$(obj)/%.dbg): KBUILD_CFLAGS = $(KBUILD_CFLAGS_32) $(vdso32-images:%=$(obj)/%.dbg): $(obj)/vdso32-%.so.dbg: FORCE \ diff --git a/arch/x86/vdso/vclock_gettime.c b/arch/x86/vdso/vclock_gettime.c index b2e4f49..9793322 100644 --- a/arch/x86/vdso/vclock_gettime.c +++ b/arch/x86/vdso/vclock_gettime.c @@ -11,9 +11,6 @@ * Check with readelf after changing. */ -/* Disable profiling for userspace code: */ -#define DISABLE_BRANCH_PROFILING - #include #include #include -- cgit v1.1 From 6a89d71078dad9b1c49ccdf1ffa656fbe36ccd1e Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 24 Jun 2014 13:46:53 -0700 Subject: x86/vdso: Error out in vdso2c if DT_RELA is present vdso2c was checking for various types of relocations to detect when the vdso had undefined symbols or was otherwise dependent on relocation at load time. Undefined symbols in the vdso would fail if accessed at runtime, and certain implementation errors (e.g. branch profiling or incorrect symbol visibilities) could result in data access through the GOT that requires relocations. This could be as simple as: extern char foo; return foo; Without some kind of visibility control, the compiler would assume that foo could be interposed at load time and would generate a relocation. x86-64 and x32 (as opposed to i386) use explicit-addent (RELA) instead of implicit-addent (REL) relocations for data access, and vdso2c forgot to detect those. Whether these bad relocations would actually fail at runtime depends on what the linker sticks in the unrelocated references. Nonetheless, these relocations have no business existing in the vDSO and should be fixed rather than silently ignored. This error could trigger on some configurations due to branch profiling. The previous patch fixed that. Signed-off-by: Andy Lutomirski Link: http://lkml.kernel.org/r/74ef0c00b4d2a3b573e00a4113874e62f772e348.1403642755.git.luto@amacapital.net Signed-off-by: H. 
Peter Anvin --- arch/x86/vdso/vdso2c.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index f42e2dd..df95a2f 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -164,7 +164,7 @@ static void BITSFUNC(go)(void *addr, size_t len, for (i = 0; dyn + i < dyn_end && GET_LE(&dyn[i].d_tag) != DT_NULL; i++) { typeof(dyn[i].d_tag) tag = GET_LE(&dyn[i].d_tag); - if (tag == DT_REL || tag == DT_RELSZ || + if (tag == DT_REL || tag == DT_RELSZ || tag == DT_RELA || tag == DT_RELENT || tag == DT_TEXTREL) fail("vdso image contains dynamic relocations\n"); } -- cgit v1.1 From cfe82d4f45c7cc39332a2be7c4c1d3bf279bbd3d Mon Sep 17 00:00:00 2001 From: Jussi Kivilinna Date: Mon, 23 Jun 2014 19:41:05 +0300 Subject: crypto: sha512_ssse3 - fix byte count to bit count conversion Byte-to-bit-count computation is only partly converted to big-endian and is mixing in CPU-endian values. Problem was noticed by sparce with warning: CHECK arch/x86/crypto/sha512_ssse3_glue.c arch/x86/crypto/sha512_ssse3_glue.c:144:19: warning: restricted __be64 degrades to integer arch/x86/crypto/sha512_ssse3_glue.c:144:17: warning: incorrect type in assignment (different base types) arch/x86/crypto/sha512_ssse3_glue.c:144:17: expected restricted __be64 arch/x86/crypto/sha512_ssse3_glue.c:144:17: got unsigned long long Cc: Signed-off-by: Jussi Kivilinna Acked-by: Tim Chen Signed-off-by: Herbert Xu --- arch/x86/crypto/sha512_ssse3_glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/crypto/sha512_ssse3_glue.c b/arch/x86/crypto/sha512_ssse3_glue.c index f30cd10..8626b03 100644 --- a/arch/x86/crypto/sha512_ssse3_glue.c +++ b/arch/x86/crypto/sha512_ssse3_glue.c @@ -141,7 +141,7 @@ static int sha512_ssse3_final(struct shash_desc *desc, u8 *out) /* save number of bits */ bits[1] = cpu_to_be64(sctx->count[0] << 3); - bits[0] = cpu_to_be64(sctx->count[1] << 3) | sctx->count[0] >> 61; + bits[0] = cpu_to_be64(sctx->count[1] << 3 | sctx->count[0] >> 61); /* Pad out to 112 mod 128 and append length */ index = sctx->count[0] & 0x7f; -- cgit v1.1 From 33b458d276bbdbe28febac0742835002b9f4778d Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 29 Jun 2014 17:12:43 +0200 Subject: KVM: SVM: Fix CPL export via SS.DPL We import the CPL via SS.DPL since ae9fedc793. However, we fail to export it this way so far. This caused spurious guest crashes, e.g. of Linux when accessing the vmport from guest user space which triggered register saving/restoring to/from host user space. Signed-off-by: Jan Kiszka Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm.c | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index ec8366c..b5e994a 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -1462,6 +1462,7 @@ static void svm_get_segment(struct kvm_vcpu *vcpu, */ if (var->unusable) var->db = 0; + var->dpl = to_svm(vcpu)->vmcb->save.cpl; break; } } -- cgit v1.1 From 3896c329df8092661dac80f55a8c3f60136fd61a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Jun 2014 14:48:19 +0200 Subject: x86, tsc: Fix cpufreq lockup Mauro reported that his AMD X2 using the powernow-k8 cpufreq driver locked up when doing cpu hotplug. Because we called set_cyc2ns_scale() from the time_cpufreq_notifier() unconditionally, it gets called multiple times for each freq change, instead of only the once, when the tsc_khz value actually changes. 
Because it gets called more than once, we run out of cyc2ns data slots and stall, waiting for a free one, but because we're half way offline, there's no consumers to free slots. By placing the call inside the condition that actually changes tsc_khz we avoid superfluous calls and avoid the problem. Reported-by: Mauro Tested-by: Mauro Fixes: 20d1c86a5776 ("sched/clock, x86: Rewrite cyc2ns() to avoid the need to disable IRQs") Cc: Cc: "Rafael J. Wysocki" Cc: Viresh Kumar Cc: Bin Gao Cc: Linus Torvalds Cc: Mika Westerberg Cc: Paul Gortmaker Cc: Stefani Seibold Cc: linux-kernel@vger.kernel.org Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 57e5ce1..ea03031 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -920,9 +920,9 @@ static int time_cpufreq_notifier(struct notifier_block *nb, unsigned long val, tsc_khz = cpufreq_scale(tsc_khz_ref, ref_freq, freq->new); if (!(freq->flags & CPUFREQ_CONST_LOOPS)) mark_tsc_unstable("cpufreq changes"); - } - set_cyc2ns_scale(tsc_khz, freq->cpu); + set_cyc2ns_scale(tsc_khz, freq->cpu); + } return 0; } -- cgit v1.1 From b292d7a10487aee6e74b1c18b8d95b92f40d4a4f Mon Sep 17 00:00:00 2001 From: HATAYAMA Daisuke Date: Wed, 25 Jun 2014 10:09:07 +0900 Subject: perf/x86/intel: ignore CondChgd bit to avoid false NMI handling MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Currently, any NMI is falsely handled by a NMI handler of NMI watchdog if CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR is set. For example, we use external NMI to make system panic to get crash dump, but in this case, the external NMI is falsely handled do to the issue. This commit deals with the issue simply by ignoring CondChgd bit. Here is explanation in detail. On x86 NMI watchdog uses performance monitoring feature to periodically signal NMI each time performance counter gets overflowed. intel_pmu_handle_irq() is called as a NMI_LOCAL handler from a NMI handler of NMI watchdog, perf_event_nmi_handler(). It identifies an owner of a given NMI by looking at overflow status bits in MSR_CORE_PERF_GLOBAL_STATUS MSR. If some of the bits are set, then it handles the given NMI as its own NMI. The problem is that the intel_pmu_handle_irq() doesn't distinguish CondChgd bit from other bits. Unlike the other status bits, CondChgd bit doesn't represent overflow status for performance counters. Thus, CondChgd bit cannot be thought of as a mark indicating a given NMI is NMI watchdog's. As a result, if CondChgd bit is set, any NMI is falsely handled by the NMI handler of NMI watchdog. Also, if type of the falsely handled NMI is either NMI_UNKNOWN, NMI_SERR or NMI_IO_CHECK, the corresponding action is never performed until CondChgd bit is cleared. I noticed this behavior on systems with Ivy Bridge processors: Intel Xeon CPU E5-2630 v2 and Intel Xeon CPU E7-8890 v2. On both systems, CondChgd bit in MSR_CORE_PERF_GLOBAL_STATUS MSR has already been set in the beginning at boot. Then the CondChgd bit is immediately cleared by next wrmsr to MSR_CORE_PERF_GLOBAL_CTRL MSR and appears to remain 0. On the other hand, on older processors such as Nehalem, Xeon E7540, CondChgd bit is not set in the beginning at boot. I'm not sure about exact behavior of CondChgd bit, in particular when this bit is set. 
Although I read Intel System Programmer's Manual to figure out that, the descriptions I found are: In 18.9.1: "The MSR_PERF_GLOBAL_STATUS MSR also provides a ¡sticky bit¢ to indicate changes to the state of performancmonitoring hardware" In Table 35-2 IA-32 Architectural MSRs 63 CondChg: status bits of this register has changed. These are different from the bahviour I see on the actual system as I explained above. At least, I think ignoring CondChgd bit should be enough for NMI watchdog perspective. Signed-off-by: HATAYAMA Daisuke Acked-by: Don Zickus Signed-off-by: Peter Zijlstra Cc: Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: linux-kernel@vger.kernel.org Link: http://lkml.kernel.org/r/20140625.103503.409316067.d.hatayama@jp.fujitsu.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index adb02aa..07846d7 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1382,6 +1382,15 @@ again: intel_pmu_lbr_read(); /* + * CondChgd bit 63 doesn't mean any overflow status. Ignore + * and clear the bit. + */ + if (__test_and_clear_bit(63, (unsigned long *)&status)) { + if (!status) + goto done; + } + + /* * PEBS overflow sets bit 62 in the global status register */ if (__test_and_clear_bit(62, (unsigned long *)&status)) { -- cgit v1.1 From b9cd18de4db3c9ffa7e17b0dc0ca99ed5aa4d43a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 3 Jul 2014 15:43:15 -0400 Subject: ptrace,x86: force IRET path after a ptrace_stop() The 'sysret' fastpath does not correctly restore even all regular registers, much less any segment registers or reflags values. That is very much part of why it's faster than 'iret'. Normally that isn't a problem, because the normal ptrace() interface catches the process using the signal handler infrastructure, which always returns with an iret. However, some paths can get caught using ptrace_event() instead of the signal path, and for those we need to make sure that we aren't going to return to user space using 'sysret'. Otherwise the modifications that may have been done to the register set by the tracer wouldn't necessarily take effect. Fix it by forcing IRET path by setting TIF_NOTIFY_RESUME from arch_ptrace_stop_needed() which is invoked from ptrace_stop(). Signed-off-by: Tejun Heo Reported-by: Andy Lutomirski Acked-by: Oleg Nesterov Suggested-by: Linus Torvalds Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- arch/x86/include/asm/ptrace.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 14fd6fd..6205f0c 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -231,6 +231,22 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, #define ARCH_HAS_USER_SINGLE_STEP_INFO +/* + * When hitting ptrace_stop(), we cannot return using SYSRET because + * that does not restore the full CPU state, only a minimal set. The + * ptracer can change arbitrary register values, which is usually okay + * because the usual ptrace stops run off the signal delivery path which + * forces IRET; however, ptrace_event() stops happen in arbitrary places + * in the kernel and don't force IRET path. + * + * So force IRET path after a ptrace stop. 
+ */ +#define arch_ptrace_stop_needed(code, info) \ +({ \ + set_thread_flag(TIF_NOTIFY_RESUME); \ + false; \ +}) + struct user_desc; extern int do_get_thread_area(struct task_struct *p, int idx, struct user_desc __user *info); -- cgit v1.1 From c7fb93ec51d462ec3540a729ba446663c26a0505 Mon Sep 17 00:00:00 2001 From: Michael Brown Date: Thu, 10 Jul 2014 12:26:20 +0100 Subject: x86/efi: Include a .bss section within the PE/COFF headers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The PE/COFF headers currently describe only the initialised-data portions of the image, and result in no space being allocated for the uninitialised-data portions. Consequently, the EFI boot stub will end up overwriting unexpected areas of memory, with unpredictable results. Fix by including a .bss section in the PE/COFF headers (functionally equivalent to the init_size field in the bzImage header). Signed-off-by: Michael Brown Cc: Thomas Bächler Cc: Josh Boyer Cc: Signed-off-by: Matt Fleming --- arch/x86/boot/header.S | 26 ++++++++++++++++++++++---- arch/x86/boot/tools/build.c | 38 ++++++++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 12 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 84c2234..7a6d43a 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -91,10 +91,9 @@ bs_die: .section ".bsdata", "a" bugger_off_msg: - .ascii "Direct floppy boot is not supported. " - .ascii "Use a boot loader program instead.\r\n" + .ascii "Use a boot loader.\r\n" .ascii "\n" - .ascii "Remove disk and press any key to reboot ...\r\n" + .ascii "Remove disk and press any key to reboot...\r\n" .byte 0 #ifdef CONFIG_EFI_STUB @@ -108,7 +107,7 @@ coff_header: #else .word 0x8664 # x86-64 #endif - .word 3 # nr_sections + .word 4 # nr_sections .long 0 # TimeDateStamp .long 0 # PointerToSymbolTable .long 1 # NumberOfSymbols @@ -250,6 +249,25 @@ section_table: .word 0 # NumberOfLineNumbers .long 0x60500020 # Characteristics (section flags) + # + # The offset & size fields are filled in by build.c. + # + .ascii ".bss" + .byte 0 + .byte 0 + .byte 0 + .byte 0 + .long 0 + .long 0x0 + .long 0 # Size of initialized data + # on disk + .long 0x0 + .long 0 # PointerToRelocations + .long 0 # PointerToLineNumbers + .word 0 # NumberOfRelocations + .word 0 # NumberOfLineNumbers + .long 0xc8000080 # Characteristics (section flags) + #endif /* CONFIG_EFI_STUB */ # Kernel attributes; used by setup. 
This is part 1 of the diff --git a/arch/x86/boot/tools/build.c b/arch/x86/boot/tools/build.c index 1a2f212..a7661c4 100644 --- a/arch/x86/boot/tools/build.c +++ b/arch/x86/boot/tools/build.c @@ -143,7 +143,7 @@ static void usage(void) #ifdef CONFIG_EFI_STUB -static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +static void update_pecoff_section_header_fields(char *section_name, u32 vma, u32 size, u32 datasz, u32 offset) { unsigned int pe_header; unsigned short num_sections; @@ -164,10 +164,10 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz put_unaligned_le32(size, section + 0x8); /* section header vma field */ - put_unaligned_le32(offset, section + 0xc); + put_unaligned_le32(vma, section + 0xc); /* section header 'size of initialised data' field */ - put_unaligned_le32(size, section + 0x10); + put_unaligned_le32(datasz, section + 0x10); /* section header 'file offset' field */ put_unaligned_le32(offset, section + 0x14); @@ -179,6 +179,11 @@ static void update_pecoff_section_header(char *section_name, u32 offset, u32 siz } } +static void update_pecoff_section_header(char *section_name, u32 offset, u32 size) +{ + update_pecoff_section_header_fields(section_name, offset, size, size, offset); +} + static void update_pecoff_setup_and_reloc(unsigned int size) { u32 setup_offset = 0x200; @@ -203,9 +208,6 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) pe_header = get_unaligned_le32(&buf[0x3c]); - /* Size of image */ - put_unaligned_le32(file_sz, &buf[pe_header + 0x50]); - /* * Size of code: Subtract the size of the first sector (512 bytes) * which includes the header. @@ -220,6 +222,22 @@ static void update_pecoff_text(unsigned int text_start, unsigned int file_sz) update_pecoff_section_header(".text", text_start, text_sz); } +static void update_pecoff_bss(unsigned int file_sz, unsigned int init_sz) +{ + unsigned int pe_header; + unsigned int bss_sz = init_sz - file_sz; + + pe_header = get_unaligned_le32(&buf[0x3c]); + + /* Size of uninitialized data */ + put_unaligned_le32(bss_sz, &buf[pe_header + 0x24]); + + /* Size of image */ + put_unaligned_le32(init_sz, &buf[pe_header + 0x50]); + + update_pecoff_section_header_fields(".bss", file_sz, bss_sz, 0, 0); +} + static int reserve_pecoff_reloc_section(int c) { /* Reserve 0x20 bytes for .reloc section */ @@ -259,6 +277,8 @@ static void efi_stub_entry_update(void) static inline void update_pecoff_setup_and_reloc(unsigned int size) {} static inline void update_pecoff_text(unsigned int text_start, unsigned int file_sz) {} +static inline void update_pecoff_bss(unsigned int file_sz, + unsigned int init_sz) {} static inline void efi_stub_defaults(void) {} static inline void efi_stub_entry_update(void) {} @@ -310,7 +330,7 @@ static void parse_zoffset(char *fname) int main(int argc, char ** argv) { - unsigned int i, sz, setup_sectors; + unsigned int i, sz, setup_sectors, init_sz; int c; u32 sys_size; struct stat sb; @@ -376,7 +396,9 @@ int main(int argc, char ** argv) buf[0x1f1] = setup_sectors-1; put_unaligned_le32(sys_size, &buf[0x1f4]); - update_pecoff_text(setup_sectors * 512, sz + i + ((sys_size * 16) - sz)); + update_pecoff_text(setup_sectors * 512, i + (sys_size * 16)); + init_sz = get_unaligned_le32(&buf[0x260]); + update_pecoff_bss(i + (sys_size * 16), init_sz); efi_stub_entry_update(); -- cgit v1.1 From 9f88b906b4455465d60ac18b8c95904f320038d5 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 3 Jul 2014 15:34:38 +0100 Subject: x86-64, vdso: Fix 
vDSO build breakage due to empty .rela.dyn Certain ld versions (observed with 2.20.0) put an empty .rela.dyn section into shared object files, breaking the assumption on the number of sections to be copied to the final output. Simply discard any empty SHT_REL and SHT_RELA sections to address this. Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/53B5861E02000078000204D1@mail.emea.novell.com Acked-by: Andy Lutomirski Tested-by: Boris Ostrovsky Tested-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vdso2c.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vdso2c.h b/arch/x86/vdso/vdso2c.h index df95a2f..11b65d4 100644 --- a/arch/x86/vdso/vdso2c.h +++ b/arch/x86/vdso/vdso2c.h @@ -93,6 +93,9 @@ static void BITSFUNC(copy_section)(struct BITSFUNC(fake_sections) *out, uint64_t flags = GET_LE(&in->sh_flags); bool copy = flags & SHF_ALLOC && + (GET_LE(&in->sh_size) || + (GET_LE(&in->sh_type) != SHT_RELA && + GET_LE(&in->sh_type) != SHT_REL)) && strcmp(name, ".altinstructions") && strcmp(name, ".altinstr_replacement"); -- cgit v1.1 From d093601be5e97d2729614419d0d256ed3b6a56b0 Mon Sep 17 00:00:00 2001 From: Jan Beulich Date: Thu, 3 Jul 2014 15:35:07 +0100 Subject: x86-32, vdso: Fix vDSO build error due to missing align_vdso_addr() Relying on static functions used just once to get inlined (and subsequently have dead code paths eliminated) is wrong: Compilers are free to decide whether they do this, regardless of optimization level. With this not happening for vdso_addr() (observed with gcc 4.1.x), an unresolved reference to align_vdso_addr() causes the build to fail. [ hpa: vdso_addr() is never actually used on x86-32, as calculate_addr in map_vdso() is always false. It ought to be possible to clean this up further, but this fixes the immediate problem. ] Signed-off-by: Jan Beulich Link: http://lkml.kernel.org/r/53B5863B02000078000204D5@mail.emea.novell.com Acked-by: Andy Lutomirski Tested-by: Boris Ostrovsky Tested-by: Andrew Morton Signed-off-by: H. Peter Anvin --- arch/x86/vdso/vma.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index e1513c4..5a5176d 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -62,6 +62,9 @@ struct linux_binprm; Only used for the 64-bit and x32 vdsos. */ static unsigned long vdso_addr(unsigned long start, unsigned len) { +#ifdef CONFIG_X86_32 + return 0; +#else unsigned long addr, end; unsigned offset; end = (start + PMD_SIZE - 1) & PMD_MASK; @@ -83,6 +86,7 @@ static unsigned long vdso_addr(unsigned long start, unsigned len) addr = align_vdso_addr(addr); return addr; +#endif } static int map_vdso(const struct vdso_image *image, bool calculate_addr) -- cgit v1.1 From 8762e5092828c4dc0f49da5a47a644c670df77f3 Mon Sep 17 00:00:00 2001 From: Boris Ostrovsky Date: Wed, 9 Jul 2014 13:18:18 -0400 Subject: x86/espfix/xen: Fix allocation of pages for paravirt page tables init_espfix_ap() is currently off by one level when informing hypervisor that allocated pages will be used for ministacks' page tables. The most immediate effect of this on a PV guest is that if 'stack_page = __get_free_page()' returns a non-zeroed-out page the hypervisor will refuse to use it for a page table (which it shouldn't be anyway). This will result in warnings by both Xen and Linux. More importantly, a subsequent write to that page (again, by a PV guest) is likely to result in fatal page fault. 
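In other words, each paravirt_alloc_*() hook should name the level of page table that the freshly allocated page will hold, not the level of the entry it is installed into, and the ministack page itself is plain data rather than a page table. A minimal sketch of the intended pairing, mirroring init_espfix_ap() after this fix (kernel-context code, not buildable standalone; the real code uses PGALLOC_GFP for the table pages and adds error handling):

#include <linux/gfp.h>
#include <asm/pgalloc.h>

static void espfix_alloc_sketch(void)
{
	pmd_t *pmd_p;
	pte_t *pte_p;
	void *stack_page;

	/* This page will hold PMD entries and be referenced from a PUD entry,
	 * so it is reported to the hypervisor as a PMD page. */
	pmd_p = (pmd_t *)__get_free_page(GFP_KERNEL);
	paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT);

	/* This page will hold PTE entries and be referenced from a PMD entry,
	 * so it is reported as a PTE page. */
	pte_p = (pte_t *)__get_free_page(GFP_KERNEL);
	paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT);

	/* The ministack page is ordinary data mapped by a PTE; it is not a
	 * page table, so no paravirt_alloc_*() call is made for it at all. */
	stack_page = (void *)__get_free_page(GFP_KERNEL);
	(void)stack_page;
}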
Signed-off-by: Boris Ostrovsky Link: http://lkml.kernel.org/r/1404926298-5565-1-git-send-email-boris.ostrovsky@oracle.com Reviewed-by: Konrad Rzeszutek Wilk Signed-off-by: H. Peter Anvin Cc: --- arch/x86/kernel/espfix_64.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/espfix_64.c b/arch/x86/kernel/espfix_64.c index 6afbb16..94d857f 100644 --- a/arch/x86/kernel/espfix_64.c +++ b/arch/x86/kernel/espfix_64.c @@ -175,7 +175,7 @@ void init_espfix_ap(void) if (!pud_present(pud)) { pmd_p = (pmd_t *)__get_free_page(PGALLOC_GFP); pud = __pud(__pa(pmd_p) | (PGTABLE_PROT & ptemask)); - paravirt_alloc_pud(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); + paravirt_alloc_pmd(&init_mm, __pa(pmd_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PUD_CLONES; n++) set_pud(&pud_p[n], pud); } @@ -185,7 +185,7 @@ void init_espfix_ap(void) if (!pmd_present(pmd)) { pte_p = (pte_t *)__get_free_page(PGALLOC_GFP); pmd = __pmd(__pa(pte_p) | (PGTABLE_PROT & ptemask)); - paravirt_alloc_pmd(&init_mm, __pa(pte_p) >> PAGE_SHIFT); + paravirt_alloc_pte(&init_mm, __pa(pte_p) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PMD_CLONES; n++) set_pmd(&pmd_p[n], pmd); } @@ -193,7 +193,6 @@ void init_espfix_ap(void) pte_p = pte_offset_kernel(&pmd, addr); stack_page = (void *)__get_free_page(GFP_KERNEL); pte = __pte(__pa(stack_page) | (__PAGE_KERNEL_RO & ptemask)); - paravirt_alloc_pte(&init_mm, __pa(stack_page) >> PAGE_SHIFT); for (n = 0; n < ESPFIX_PTE_CLONES; n++) set_pte(&pte_p[n*PTE_STRIDE], pte); -- cgit v1.1 From d3f44fbabe55132832e152606365adb640296378 Mon Sep 17 00:00:00 2001 From: Paul Bolle Date: Mon, 30 Jun 2014 16:32:29 +0200 Subject: x86: Remove unused variable "polling" Compile tested. "polling" is unused since commit f80c5b39b80a ("sched/idle, x86: Switch from TS_POLLING to TIF_POLLING_NRFLAG"). Signed-off-by: Paul Bolle Signed-off-by: Peter Zijlstra Cc: Jiri Kosina Link: http://lkml.kernel.org/r/1404138749.2978.6.camel@x41 Signed-off-by: Ingo Molnar --- arch/x86/kernel/apm_32.c | 1 - 1 file changed, 1 deletion(-) (limited to 'arch/x86') diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c index f3a1f04..5848744 100644 --- a/arch/x86/kernel/apm_32.c +++ b/arch/x86/kernel/apm_32.c @@ -841,7 +841,6 @@ static int apm_do_idle(void) u32 eax; u8 ret = 0; int idled = 0; - int polling; int err = 0; if (!need_resched()) { -- cgit v1.1 From 4badad352a6bb202ec68afa7a574c0bb961e5ebc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 6 Jun 2014 19:53:16 +0200 Subject: locking/mutex: Disable optimistic spinning on some architectures The optimistic spin code assumes regular stores and cmpxchg() play nice; this is found to not be true for at least: parisc, sparc32, tile32, metag-lock1, arc-!llsc and hexagon. There is further wreckage, but this in particular seemed easy to trigger, so blacklist this. Opt in for known good archs. Signed-off-by: Peter Zijlstra Reported-by: Mikulas Patocka Cc: David Miller Cc: Chris Metcalf Cc: James Bottomley Cc: Vineet Gupta Cc: Jason Low Cc: Waiman Long Cc: "James E.J. 
Bottomley" Cc: Paul McKenney Cc: John David Anglin Cc: James Hogan Cc: Linus Torvalds Cc: Davidlohr Bueso Cc: stable@vger.kernel.org Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Russell King Cc: Will Deacon Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Link: http://lkml.kernel.org/r/20140606175316.GV13930@laptop.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + 1 file changed, 1 insertion(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a8f749e..d24887b 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -131,6 +131,7 @@ config X86 select HAVE_CC_STACKPROTECTOR select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL + select ARCH_SUPPORTS_ATOMIC_RMW config INSTRUCTION_DECODER def_bool y -- cgit v1.1 From 9dae3d0d9e64c3cb8bb172f041d4e66d4b92088a Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:11 +0200 Subject: apei, mce: Factor out APEI architecture specific MCE calls. This commit abstracts MCE calls and provides weak corresponding default implementation for those architectures which do not need arch specific actions. Each platform willing to do additional architectural actions should provides desired function definition. It allows us to avoid wrap code into #ifdef in generic code and prevent new platform from introducing dummy stub function too. Initially, there are two APEI arch-specific calls: - arch_apei_enable_cmcff() - arch_apei_report_mem_error() Both interact with MCE driver for X86 architecture. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/acpi/Makefile | 1 + arch/x86/kernel/acpi/apei.c | 56 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+) create mode 100644 arch/x86/kernel/acpi/apei.c (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 163b225..3242e59 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o +obj-$(CONFIG_ACPI_APEI) += apei.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c new file mode 100644 index 0000000..12b13de --- /dev/null +++ b/arch/x86/kernel/acpi/apei.c @@ -0,0 +1,56 @@ +/* + * Arch-specific APEI-related functions. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + */ + +#include + +#include + +int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) +{ +#ifdef CONFIG_X86_MCE + int i; + struct acpi_hest_ia_corrected *cmc; + struct acpi_hest_ia_error_bank *mc_bank; + + if (hest_hdr->type != ACPI_HEST_TYPE_IA32_CORRECTED_CHECK) + return 0; + + cmc = (struct acpi_hest_ia_corrected *)hest_hdr; + if (!cmc->enabled) + return 0; + + /* + * We expect HEST to provide a list of MC banks that report errors + * in firmware first mode. 
Otherwise, return non-zero value to + * indicate that we are done parsing HEST. + */ + if (!(cmc->flags & ACPI_HEST_FIRMWARE_FIRST) || + !cmc->num_hardware_banks) + return 1; + + pr_info("HEST: Enabling Firmware First mode for corrected errors.\n"); + + mc_bank = (struct acpi_hest_ia_error_bank *)(cmc + 1); + for (i = 0; i < cmc->num_hardware_banks; i++, mc_bank++) + mce_disable_bank(mc_bank->bank_number); +#endif + return 1; +} + +void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) +{ +#ifdef CONFIG_X86_MCE + apei_mce_report_mem_error(sev, mem_err); +#endif +} -- cgit v1.1 From 44a69f6195628f6f940566d133a72987559e102d Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:12 +0200 Subject: acpi, apei, ghes: Make NMI error notification to be GHES architecture extension. Currently APEI depends on the x86 architecture, because GHES's NMI hardware error notification is supported by x86 only. However, many other APEI features can still be used perfectly well by other architectures. This commit adds two symbols: 1. HAVE_ACPI_APEI for those archs which support APEI. 2. HAVE_ACPI_APEI_NMI, which is used for NMI code isolation in the ghes.c file. NMI related data and functions are grouped so they can be wrapped inside one #ifdef section. Appropriate function stubs are provided for the !NMI case. Note that there are no functional changes for x86, since the HAVE_ACPI_APEI and HAVE_ACPI_APEI_NMI symbols are hard-selected. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/Kconfig | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d24887b..4387344 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -132,6 +132,8 @@ config X86 select GENERIC_CPU_AUTOPROBE select HAVE_ARCH_AUDITSYSCALL select ARCH_SUPPORTS_ATOMIC_RMW + select HAVE_ACPI_APEI if ACPI + select HAVE_ACPI_APEI_NMI if ACPI config INSTRUCTION_DECODER def_bool y -- cgit v1.1 From 594c7255dce7a13cac50cf2470cc56e2c3b0494e Mon Sep 17 00:00:00 2001 From: Tomasz Nowicki Date: Tue, 22 Jul 2014 11:20:13 +0200 Subject: acpi, apei, ghes: Factor out ioremap virtual memory for IRQ and NMI context. GHES currently maps two pages with atomic_ioremap. From now on, NMI handling is architecture dependent, so there is no need to allocate an NMI page for platforms without NMI support. To make it possible to avoid using a second page, swap the existing page order so that the IRQ context page is first, and the optional NMI context page is second. Then, use HAVE_ACPI_APEI_NMI to decide how many pages are to be allocated. Signed-off-by: Tomasz Nowicki Acked-by: Borislav Petkov Signed-off-by: Tony Luck --- arch/x86/kernel/acpi/apei.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'arch/x86') diff --git a/arch/x86/kernel/acpi/apei.c b/arch/x86/kernel/acpi/apei.c index 12b13de..c280df6 100644 --- a/arch/x86/kernel/acpi/apei.c +++ b/arch/x86/kernel/acpi/apei.c @@ -15,6 +15,7 @@ #include <acpi/apei.h> #include <asm/mce.h> +#include <asm/tlbflush.h> int arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data) { @@ -54,3 +55,8 @@ void arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err) apei_mce_report_mem_error(sev, mem_err); #endif } + +void arch_apei_flush_tlb_one(unsigned long addr) +{ + __flush_tlb_one(addr); +}
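These x86 overrides pair with the generic side of the series, which the arch/x86 filter above hides: the APEI core declares the hooks as weak symbols, so architectures with nothing special to do need neither stubs nor #ifdefs. Below is a minimal sketch of that pattern, assuming the weak defaults simply do nothing and report that HEST parsing is finished; the actual generic definitions live under drivers/acpi/apei and may differ in detail.

#include <acpi/apei.h>

/* Generic (weak) defaults, used when an architecture provides no override. */
int __weak arch_apei_enable_cmcff(struct acpi_hest_header *hest_hdr, void *data)
{
	return 1;	/* nothing arch-specific to do; stop parsing HEST */
}

void __weak arch_apei_report_mem_error(int sev, struct cper_sec_mem_err *mem_err)
{
	/* no arch-specific reporting by default */
}

/*
 * An architecture opts in simply by providing a strong definition with the
 * same signature, as arch/x86/kernel/acpi/apei.c does above; the linker then
 * prefers the strong symbol over the weak default, and the generic code can
 * call the hook unconditionally.
 */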