Diffstat (limited to 'arch/x86')
69 files changed, 1843 insertions, 1391 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 2a1f0ce..0cc8811 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -705,7 +705,6 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP - select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly @@ -718,7 +717,7 @@ config PARAVIRT_SPINLOCKS config QUEUED_LOCK_STAT bool "Paravirt queued spinlock statistics" - depends on PARAVIRT_SPINLOCKS && DEBUG_FS && QUEUED_SPINLOCKS + depends on PARAVIRT_SPINLOCKS && DEBUG_FS ---help--- Enable the collection of statistical data on the slowpath behavior of paravirtualized queued spinlocks and report diff --git a/arch/x86/boot/compressed/eboot.c b/arch/x86/boot/compressed/eboot.c index ff574da..cc69e37 100644 --- a/arch/x86/boot/compressed/eboot.c +++ b/arch/x86/boot/compressed/eboot.c @@ -29,22 +29,11 @@ __pure const struct efi_config *__efi_early(void) static void setup_boot_services##bits(struct efi_config *c) \ { \ efi_system_table_##bits##_t *table; \ - efi_boot_services_##bits##_t *bt; \ \ table = (typeof(table))sys_table; \ \ + c->boot_services = table->boottime; \ c->text_output = table->con_out; \ - \ - bt = (typeof(bt))(unsigned long)(table->boottime); \ - \ - c->allocate_pool = bt->allocate_pool; \ - c->allocate_pages = bt->allocate_pages; \ - c->get_memory_map = bt->get_memory_map; \ - c->free_pool = bt->free_pool; \ - c->free_pages = bt->free_pages; \ - c->locate_handle = bt->locate_handle; \ - c->handle_protocol = bt->handle_protocol; \ - c->exit_boot_services = bt->exit_boot_services; \ } BOOT_SERVICES(32); BOOT_SERVICES(64); @@ -286,29 +275,6 @@ void efi_char16_printk(efi_system_table_t *table, efi_char16_t *str) } } -static void find_bits(unsigned long mask, u8 *pos, u8 *size) -{ - u8 first, len; - - first = 0; - len = 0; - - if (mask) { - while (!(mask & 0x1)) { - mask = mask >> 1; - first++; - } - - while (mask & 0x1) { - mask = mask >> 1; - len++; - } - } - - *pos = first; - *size = len; -} - static efi_status_t __setup_efi_pci32(efi_pci_io_protocol_32 *pci, struct pci_setup_rom **__rom) { @@ -578,7 +544,7 @@ setup_uga32(void **uga_handle, unsigned long size, u32 *width, u32 *height) efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; unsigned long nr_ugas; u32 *handles = (u32 *)uga_handle;; - efi_status_t status; + efi_status_t status = EFI_INVALID_PARAMETER; int i; first_uga = NULL; @@ -623,7 +589,7 @@ setup_uga64(void **uga_handle, unsigned long size, u32 *width, u32 *height) efi_guid_t uga_proto = EFI_UGA_PROTOCOL_GUID; unsigned long nr_ugas; u64 *handles = (u64 *)uga_handle;; - efi_status_t status; + efi_status_t status = EFI_INVALID_PARAMETER; int i; first_uga = NULL; @@ -1004,79 +970,87 @@ static efi_status_t alloc_e820ext(u32 nr_desc, struct setup_data **e820ext, return status; } -static efi_status_t exit_boot(struct boot_params *boot_params, - void *handle, bool is64) -{ - struct efi_info *efi = &boot_params->efi_info; - unsigned long map_sz, key, desc_size; - efi_memory_desc_t *mem_map; +struct exit_boot_struct { + struct boot_params *boot_params; + struct efi_info *efi; struct setup_data *e820ext; - const char *signature; __u32 e820ext_size; - __u32 nr_desc, prev_nr_desc; - efi_status_t status; - __u32 desc_version; - bool called_exit = false; - u8 nr_entries; - int i; - - nr_desc = 0; - e820ext = NULL; - e820ext_size = 0; - -get_map: - status = efi_get_memory_map(sys_table, 
&mem_map, &map_sz, &desc_size, - &desc_version, &key); - - if (status != EFI_SUCCESS) - return status; - - prev_nr_desc = nr_desc; - nr_desc = map_sz / desc_size; - if (nr_desc > prev_nr_desc && - nr_desc > ARRAY_SIZE(boot_params->e820_map)) { - u32 nr_e820ext = nr_desc - ARRAY_SIZE(boot_params->e820_map); - - status = alloc_e820ext(nr_e820ext, &e820ext, &e820ext_size); - if (status != EFI_SUCCESS) - goto free_mem_map; + bool is64; +}; - efi_call_early(free_pool, mem_map); - goto get_map; /* Allocated memory, get map again */ +static efi_status_t exit_boot_func(efi_system_table_t *sys_table_arg, + struct efi_boot_memmap *map, + void *priv) +{ + static bool first = true; + const char *signature; + __u32 nr_desc; + efi_status_t status; + struct exit_boot_struct *p = priv; + + if (first) { + nr_desc = *map->buff_size / *map->desc_size; + if (nr_desc > ARRAY_SIZE(p->boot_params->e820_map)) { + u32 nr_e820ext = nr_desc - + ARRAY_SIZE(p->boot_params->e820_map); + + status = alloc_e820ext(nr_e820ext, &p->e820ext, + &p->e820ext_size); + if (status != EFI_SUCCESS) + return status; + } + first = false; } - signature = is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; - memcpy(&efi->efi_loader_signature, signature, sizeof(__u32)); + signature = p->is64 ? EFI64_LOADER_SIGNATURE : EFI32_LOADER_SIGNATURE; + memcpy(&p->efi->efi_loader_signature, signature, sizeof(__u32)); - efi->efi_systab = (unsigned long)sys_table; - efi->efi_memdesc_size = desc_size; - efi->efi_memdesc_version = desc_version; - efi->efi_memmap = (unsigned long)mem_map; - efi->efi_memmap_size = map_sz; + p->efi->efi_systab = (unsigned long)sys_table_arg; + p->efi->efi_memdesc_size = *map->desc_size; + p->efi->efi_memdesc_version = *map->desc_ver; + p->efi->efi_memmap = (unsigned long)*map->map; + p->efi->efi_memmap_size = *map->map_size; #ifdef CONFIG_X86_64 - efi->efi_systab_hi = (unsigned long)sys_table >> 32; - efi->efi_memmap_hi = (unsigned long)mem_map >> 32; + p->efi->efi_systab_hi = (unsigned long)sys_table_arg >> 32; + p->efi->efi_memmap_hi = (unsigned long)*map->map >> 32; #endif + return EFI_SUCCESS; +} + +static efi_status_t exit_boot(struct boot_params *boot_params, + void *handle, bool is64) +{ + unsigned long map_sz, key, desc_size, buff_size; + efi_memory_desc_t *mem_map; + struct setup_data *e820ext; + __u32 e820ext_size; + efi_status_t status; + __u32 desc_version; + struct efi_boot_memmap map; + struct exit_boot_struct priv; + + map.map = &mem_map; + map.map_size = &map_sz; + map.desc_size = &desc_size; + map.desc_ver = &desc_version; + map.key_ptr = &key; + map.buff_size = &buff_size; + priv.boot_params = boot_params; + priv.efi = &boot_params->efi_info; + priv.e820ext = NULL; + priv.e820ext_size = 0; + priv.is64 = is64; + /* Might as well exit boot services now */ - status = efi_call_early(exit_boot_services, handle, key); - if (status != EFI_SUCCESS) { - /* - * ExitBootServices() will fail if any of the event - * handlers change the memory map. In which case, we - * must be prepared to retry, but only once so that - * we're guaranteed to exit on repeated failures instead - * of spinning forever. - */ - if (called_exit) - goto free_mem_map; - - called_exit = true; - efi_call_early(free_pool, mem_map); - goto get_map; - } + status = efi_exit_boot_services(sys_table, handle, &map, &priv, + exit_boot_func); + if (status != EFI_SUCCESS) + return status; + e820ext = priv.e820ext; + e820ext_size = priv.e820ext_size; /* Historic? 
*/ boot_params->alt_mem_k = 32 * 1024; @@ -1085,10 +1059,6 @@ get_map: return status; return EFI_SUCCESS; - -free_mem_map: - efi_call_early(free_pool, mem_map); - return status; } /* diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1038524..fd0b6a2 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -82,7 +82,7 @@ ENTRY(efi_pe_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 88(%eax) + add %esi, 32(%eax) pushl %eax call make_boot_params @@ -108,7 +108,7 @@ ENTRY(efi32_stub_entry) /* Relocate efi_config->call() */ leal efi32_config(%esi), %eax - add %esi, 88(%eax) + add %esi, 32(%eax) pushl %eax 2: call efi_main @@ -264,7 +264,7 @@ relocated: #ifdef CONFIG_EFI_STUB .data efi32_config: - .fill 11,8,0 + .fill 4,8,0 .long efi_call_phys .long 0 .byte 0 diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 0d80a7a..efdfba2 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -265,7 +265,7 @@ ENTRY(efi_pe_entry) /* * Relocate efi_config->call(). */ - addq %rbp, efi64_config+88(%rip) + addq %rbp, efi64_config+32(%rip) movq %rax, %rdi call make_boot_params @@ -285,7 +285,7 @@ handover_entry: * Relocate efi_config->call(). */ movq efi_config(%rip), %rax - addq %rbp, 88(%rax) + addq %rbp, 32(%rax) 2: movq efi_config(%rip), %rdi call efi_main @@ -457,14 +457,14 @@ efi_config: #ifdef CONFIG_EFI_MIXED .global efi32_config efi32_config: - .fill 11,8,0 + .fill 4,8,0 .quad efi64_thunk .byte 0 #endif .global efi64_config efi64_config: - .fill 11,8,0 + .fill 4,8,0 .quad efi_call .byte 1 #endif /* CONFIG_EFI_STUB */ diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index d172c61..02fff3e 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1002,7 +1002,6 @@ ENTRY(error_entry) testb $3, CS+8(%rsp) jz .Lerror_kernelspace -.Lerror_entry_from_usermode_swapgs: /* * We entered from user mode or we're pretending to have entered * from user mode due to an IRET fault. @@ -1045,7 +1044,8 @@ ENTRY(error_entry) * gsbase and proceed. We'll fix up the exception and land in * .Lgs_change's error handler with kernel gsbase. */ - jmp .Lerror_entry_from_usermode_swapgs + SWAPGS + jmp .Lerror_entry_done .Lbstep_iret: /* Fix truncated RIP */ diff --git a/arch/x86/entry/vdso/vdso2c.h b/arch/x86/entry/vdso/vdso2c.h index 4f74119..3dab75f 100644 --- a/arch/x86/entry/vdso/vdso2c.h +++ b/arch/x86/entry/vdso/vdso2c.h @@ -22,7 +22,7 @@ static void BITSFUNC(go)(void *raw_addr, size_t raw_len, ELF(Phdr) *pt = (ELF(Phdr) *)(raw_addr + GET_LE(&hdr->e_phoff)); - if (hdr->e_type != ET_DYN) + if (GET_LE(&hdr->e_type) != ET_DYN) fail("input is not a shared object\n"); /* Walk the segment table. 
*/ diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index e07a22b..f5f4b3f 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -119,8 +119,8 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ diff --git a/arch/x86/events/amd/uncore.c b/arch/x86/events/amd/uncore.c index e6131d4..65577f0 100644 --- a/arch/x86/events/amd/uncore.c +++ b/arch/x86/events/amd/uncore.c @@ -29,6 +29,8 @@ #define COUNTER_SHIFT 16 +static HLIST_HEAD(uncore_unused_list); + struct amd_uncore { int id; int refcnt; @@ -39,7 +41,7 @@ struct amd_uncore { cpumask_t *active_mask; struct pmu *pmu; struct perf_event *events[MAX_COUNTERS]; - struct amd_uncore *free_when_cpu_online; + struct hlist_node node; }; static struct amd_uncore * __percpu *amd_uncore_nb; @@ -306,6 +308,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_nb->msr_base = MSR_F15H_NB_PERF_CTL; uncore_nb->active_mask = &amd_nb_active_mask; uncore_nb->pmu = &amd_nb_pmu; + uncore_nb->id = -1; *per_cpu_ptr(amd_uncore_nb, cpu) = uncore_nb; } @@ -319,6 +322,7 @@ static int amd_uncore_cpu_up_prepare(unsigned int cpu) uncore_l2->msr_base = MSR_F16H_L2I_PERF_CTL; uncore_l2->active_mask = &amd_l2_active_mask; uncore_l2->pmu = &amd_l2_pmu; + uncore_l2->id = -1; *per_cpu_ptr(amd_uncore_l2, cpu) = uncore_l2; } @@ -348,7 +352,7 @@ amd_uncore_find_online_sibling(struct amd_uncore *this, continue; if (this->id == that->id) { - that->free_when_cpu_online = this; + hlist_add_head(&this->node, &uncore_unused_list); this = that; break; } @@ -388,13 +392,23 @@ static int amd_uncore_cpu_starting(unsigned int cpu) return 0; } +static void uncore_clean_online(void) +{ + struct amd_uncore *uncore; + struct hlist_node *n; + + hlist_for_each_entry_safe(uncore, n, &uncore_unused_list, node) { + hlist_del(&uncore->node); + kfree(uncore); + } +} + static void uncore_online(unsigned int cpu, struct amd_uncore * __percpu *uncores) { struct amd_uncore *uncore = *per_cpu_ptr(uncores, cpu); - kfree(uncore->free_when_cpu_online); - uncore->free_when_cpu_online = NULL; + uncore_clean_online(); if (cpu == uncore->cpu) cpumask_set_cpu(cpu, uncore->active_mask); diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c index d0efb5c..18a1acf 100644 --- a/arch/x86/events/core.c +++ b/arch/x86/events/core.c @@ -1201,6 +1201,9 @@ static int x86_pmu_add(struct perf_event *event, int flags) * If group events scheduling transaction was started, * skip the schedulability test here, it will be performed * at commit time (->commit_txn) as a whole. + * + * If commit fails, we'll call ->del() on all events + * for which ->add() was called. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) goto done_collect; @@ -1223,6 +1226,14 @@ done_collect: cpuc->n_added += n - n0; cpuc->n_txn += n - n0; + if (x86_pmu.add) { + /* + * This is before x86_pmu_enable() will call x86_pmu_start(), + * so we enable LBRs before an event needs them etc.. 
+ */ + x86_pmu.add(event); + } + ret = 0; out: return ret; @@ -1346,7 +1357,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) event->hw.flags &= ~PERF_X86_EVENT_COMMITTED; /* - * If we're called during a txn, we don't need to do anything. + * If we're called during a txn, we only need to undo x86_pmu.add. * The events never got scheduled and ->cancel_txn will truncate * the event_list. * @@ -1354,7 +1365,7 @@ static void x86_pmu_del(struct perf_event *event, int flags) * an event added during that same TXN. */ if (cpuc->txn_flags & PERF_PMU_TXN_ADD) - return; + goto do_del; /* * Not a TXN, therefore cleanup properly. @@ -1384,6 +1395,15 @@ static void x86_pmu_del(struct perf_event *event, int flags) --cpuc->n_events; perf_event_update_userpage(event); + +do_del: + if (x86_pmu.del) { + /* + * This is after x86_pmu_stop(); so we disable LBRs after any + * event can need them etc.. + */ + x86_pmu.del(event); + } } int x86_pmu_handle_irq(struct pt_regs *regs) diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c index 0a6e393..982c9e3 100644 --- a/arch/x86/events/intel/bts.c +++ b/arch/x86/events/intel/bts.c @@ -31,7 +31,17 @@ struct bts_ctx { struct perf_output_handle handle; struct debug_store ds_back; - int started; + int state; +}; + +/* BTS context states: */ +enum { + /* no ongoing AUX transactions */ + BTS_STATE_STOPPED = 0, + /* AUX transaction is on, BTS tracing is disabled */ + BTS_STATE_INACTIVE, + /* AUX transaction is on, BTS tracing is running */ + BTS_STATE_ACTIVE, }; static DEFINE_PER_CPU(struct bts_ctx, bts_ctx); @@ -204,6 +214,15 @@ static void bts_update(struct bts_ctx *bts) static int bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle); +/* + * Ordering PMU callbacks wrt themselves and the PMI is done by means + * of bts::state, which: + * - is set when bts::handle::event is valid, that is, between + * perf_aux_output_begin() and perf_aux_output_end(); + * - is zero otherwise; + * - is ordered against bts::handle::event with a compiler barrier. + */ + static void __bts_event_start(struct perf_event *event) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); @@ -221,10 +240,13 @@ static void __bts_event_start(struct perf_event *event) /* * local barrier to make sure that ds configuration made it - * before we enable BTS + * before we enable BTS and bts::state goes ACTIVE */ wmb(); + /* INACTIVE/STOPPED -> ACTIVE */ + WRITE_ONCE(bts->state, BTS_STATE_ACTIVE); + intel_pmu_enable_bts(config); } @@ -251,9 +273,6 @@ static void bts_event_start(struct perf_event *event, int flags) __bts_event_start(event); - /* PMI handler: this counter is running and likely generating PMIs */ - ACCESS_ONCE(bts->started) = 1; - return; fail_end_stop: @@ -263,30 +282,34 @@ fail_stop: event->hw.state = PERF_HES_STOPPED; } -static void __bts_event_stop(struct perf_event *event) +static void __bts_event_stop(struct perf_event *event, int state) { + struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + + /* ACTIVE -> INACTIVE(PMI)/STOPPED(->stop()) */ + WRITE_ONCE(bts->state, state); + /* * No extra synchronization is mandated by the documentation to have * BTS data stores globally visible. 
*/ intel_pmu_disable_bts(); - - if (event->hw.state & PERF_HES_STOPPED) - return; - - ACCESS_ONCE(event->hw.state) |= PERF_HES_STOPPED; } static void bts_event_stop(struct perf_event *event, int flags) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); - struct bts_buffer *buf = perf_get_aux(&bts->handle); + struct bts_buffer *buf = NULL; + int state = READ_ONCE(bts->state); - /* PMI handler: don't restart this counter */ - ACCESS_ONCE(bts->started) = 0; + if (state == BTS_STATE_ACTIVE) + __bts_event_stop(event, BTS_STATE_STOPPED); - __bts_event_stop(event); + if (state != BTS_STATE_STOPPED) + buf = perf_get_aux(&bts->handle); + + event->hw.state |= PERF_HES_STOPPED; if (flags & PERF_EF_UPDATE) { bts_update(bts); @@ -296,6 +319,7 @@ static void bts_event_stop(struct perf_event *event, int flags) bts->handle.head = local_xchg(&buf->data_size, buf->nr_pages << PAGE_SHIFT); + perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), !!local_xchg(&buf->lost, 0)); } @@ -310,8 +334,20 @@ static void bts_event_stop(struct perf_event *event, int flags) void intel_bts_enable_local(void) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + int state = READ_ONCE(bts->state); + + /* + * Here we transition from INACTIVE to ACTIVE; + * if we instead are STOPPED from the interrupt handler, + * stay that way. Can't be ACTIVE here though. + */ + if (WARN_ON_ONCE(state == BTS_STATE_ACTIVE)) + return; + + if (state == BTS_STATE_STOPPED) + return; - if (bts->handle.event && bts->started) + if (bts->handle.event) __bts_event_start(bts->handle.event); } @@ -319,8 +355,15 @@ void intel_bts_disable_local(void) { struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); + /* + * Here we transition from ACTIVE to INACTIVE; + * do nothing for STOPPED or INACTIVE. + */ + if (READ_ONCE(bts->state) != BTS_STATE_ACTIVE) + return; + if (bts->handle.event) - __bts_event_stop(bts->handle.event); + __bts_event_stop(bts->handle.event, BTS_STATE_INACTIVE); } static int @@ -335,8 +378,6 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) return 0; head = handle->head & ((buf->nr_pages << PAGE_SHIFT) - 1); - if (WARN_ON_ONCE(head != local_read(&buf->head))) - return -EINVAL; phys = &buf->buf[buf->cur_buf]; space = phys->offset + phys->displacement + phys->size - head; @@ -403,22 +444,37 @@ bts_buffer_reset(struct bts_buffer *buf, struct perf_output_handle *handle) int intel_bts_interrupt(void) { + struct debug_store *ds = this_cpu_ptr(&cpu_hw_events)->ds; struct bts_ctx *bts = this_cpu_ptr(&bts_ctx); struct perf_event *event = bts->handle.event; struct bts_buffer *buf; s64 old_head; - int err; + int err = -ENOSPC, handled = 0; - if (!event || !bts->started) - return 0; + /* + * The only surefire way of knowing if this NMI is ours is by checking + * the write ptr against the PMI threshold. 
+ */ + if (ds && (ds->bts_index >= ds->bts_interrupt_threshold)) + handled = 1; + + /* + * this is wrapped in intel_bts_enable_local/intel_bts_disable_local, + * so we can only be INACTIVE or STOPPED + */ + if (READ_ONCE(bts->state) == BTS_STATE_STOPPED) + return handled; buf = perf_get_aux(&bts->handle); + if (!buf) + return handled; + /* * Skip snapshot counters: they don't use the interrupt, but * there's no other way of telling, because the pointer will * keep moving */ - if (!buf || buf->snapshot) + if (buf->snapshot) return 0; old_head = local_read(&buf->head); @@ -426,18 +482,27 @@ int intel_bts_interrupt(void) /* no new data */ if (old_head == local_read(&buf->head)) - return 0; + return handled; perf_aux_output_end(&bts->handle, local_xchg(&buf->data_size, 0), !!local_xchg(&buf->lost, 0)); buf = perf_aux_output_begin(&bts->handle, event); - if (!buf) - return 1; + if (buf) + err = bts_buffer_reset(buf, &bts->handle); + + if (err) { + WRITE_ONCE(bts->state, BTS_STATE_STOPPED); - err = bts_buffer_reset(buf, &bts->handle); - if (err) - perf_aux_output_end(&bts->handle, 0, false); + if (buf) { + /* + * BTS_STATE_STOPPED should be visible before + * cleared handle::event + */ + barrier(); + perf_aux_output_end(&bts->handle, 0, false); + } + } return 1; } @@ -519,7 +584,8 @@ static __init int bts_init(void) if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) return -ENODEV; - bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE; + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | + PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; bts_pmu.event_init = bts_event_init; bts_pmu.add = bts_event_add; diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 2cbde2f..a3a9eb8 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -1730,9 +1730,11 @@ static __initconst const u64 knl_hw_cache_extra_regs * disabled state if called consecutively. * * During consecutive calls, the same disable value will be written to related - * registers, so the PMU state remains unchanged. hw.state in - * intel_bts_disable_local will remain PERF_HES_STOPPED too in consecutive - * calls. + * registers, so the PMU state remains unchanged. + * + * intel_bts events don't coexist with intel PMU's BTS events because of + * x86_add_exclusive(x86_lbr_exclusive_lbr); there's no need to keep them + * disabled around intel PMU's event batching etc, only inside the PMI handler. 
*/ static void __intel_pmu_disable_all(void) { @@ -1742,8 +1744,6 @@ static void __intel_pmu_disable_all(void) if (test_bit(INTEL_PMC_IDX_FIXED_BTS, cpuc->active_mask)) intel_pmu_disable_bts(); - else - intel_bts_disable_local(); intel_pmu_pebs_disable_all(); } @@ -1771,8 +1771,7 @@ static void __intel_pmu_enable_all(int added, bool pmi) return; intel_pmu_enable_bts(event->hw.config); - } else - intel_bts_enable_local(); + } } static void intel_pmu_enable_all(int added) @@ -1907,13 +1906,6 @@ static void intel_pmu_disable_event(struct perf_event *event) cpuc->intel_ctrl_host_mask &= ~(1ull << hwc->idx); cpuc->intel_cp_status &= ~(1ull << hwc->idx); - /* - * must disable before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_disable(event); - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { intel_pmu_disable_fixed(hwc); return; @@ -1925,6 +1917,14 @@ static void intel_pmu_disable_event(struct perf_event *event) intel_pmu_pebs_disable(event); } +static void intel_pmu_del_event(struct perf_event *event) +{ + if (needs_branch_stack(event)) + intel_pmu_lbr_del(event); + if (event->attr.precise_ip) + intel_pmu_pebs_del(event); +} + static void intel_pmu_enable_fixed(struct hw_perf_event *hwc) { int idx = hwc->idx - INTEL_PMC_IDX_FIXED; @@ -1968,12 +1968,6 @@ static void intel_pmu_enable_event(struct perf_event *event) intel_pmu_enable_bts(hwc->config); return; } - /* - * must enabled before any actual event - * because any event may be combined with LBR - */ - if (needs_branch_stack(event)) - intel_pmu_lbr_enable(event); if (event->attr.exclude_host) cpuc->intel_ctrl_guest_mask |= (1ull << hwc->idx); @@ -1994,6 +1988,14 @@ static void intel_pmu_enable_event(struct perf_event *event) __x86_pmu_enable_event(hwc, ARCH_PERFMON_EVENTSEL_ENABLE); } +static void intel_pmu_add_event(struct perf_event *event) +{ + if (event->attr.precise_ip) + intel_pmu_pebs_add(event); + if (needs_branch_stack(event)) + intel_pmu_lbr_add(event); +} + /* * Save and restart an expired event. Called by NMI contexts, * so it has to be careful about preempting normal event ops: @@ -2073,6 +2075,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) */ if (!x86_pmu.late_ack) apic_write(APIC_LVTPC, APIC_DM_NMI); + intel_bts_disable_local(); __intel_pmu_disable_all(); handled = intel_pmu_drain_bts_buffer(); handled += intel_bts_interrupt(); @@ -2172,6 +2175,7 @@ done: /* Only restore PMU state when it's active. See x86_pmu_disable(). 
*/ if (cpuc->enabled) __intel_pmu_enable_all(0, true); + intel_bts_enable_local(); /* * Only unmask the NMI after the overflow counters @@ -3290,6 +3294,8 @@ static __initconst const struct x86_pmu intel_pmu = { .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_event, .disable = intel_pmu_disable_event, + .add = intel_pmu_add_event, + .del = intel_pmu_del_event, .hw_config = intel_pmu_hw_config, .schedule_events = x86_schedule_events, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, diff --git a/arch/x86/events/intel/cqm.c b/arch/x86/events/intel/cqm.c index 783c49d..8f82b02 100644 --- a/arch/x86/events/intel/cqm.c +++ b/arch/x86/events/intel/cqm.c @@ -458,6 +458,11 @@ static void __intel_cqm_event_count(void *info); static void init_mbm_sample(u32 rmid, u32 evt_type); static void __intel_mbm_event_count(void *info); +static bool is_cqm_event(int e) +{ + return (e == QOS_L3_OCCUP_EVENT_ID); +} + static bool is_mbm_event(int e) { return (e >= QOS_MBM_TOTAL_EVENT_ID && e <= QOS_MBM_LOCAL_EVENT_ID); @@ -1366,6 +1371,10 @@ static int intel_cqm_event_init(struct perf_event *event) (event->attr.config > QOS_MBM_LOCAL_EVENT_ID)) return -EINVAL; + if ((is_cqm_event(event->attr.config) && !cqm_enabled) || + (is_mbm_event(event->attr.config) && !mbm_enabled)) + return -EINVAL; + /* unsupported modes and filters */ if (event->attr.exclude_user || event->attr.exclude_kernel || diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 7ce9f3f..0319311 100644 --- a/arch/x86/events/intel/ds.c +++ b/arch/x86/events/intel/ds.c @@ -806,9 +806,65 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) return &emptyconstraint; } -static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) +/* + * We need the sched_task callback even for per-cpu events when we use + * the large interrupt threshold, such that we can provide PID and TID + * to PEBS samples. + */ +static inline bool pebs_needs_sched_cb(struct cpu_hw_events *cpuc) +{ + return cpuc->n_pebs && (cpuc->n_pebs == cpuc->n_large_pebs); +} + +static inline void pebs_update_threshold(struct cpu_hw_events *cpuc) +{ + struct debug_store *ds = cpuc->ds; + u64 threshold; + + if (cpuc->n_pebs == cpuc->n_large_pebs) { + threshold = ds->pebs_absolute_maximum - + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + } else { + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + } + + ds->pebs_interrupt_threshold = threshold; +} + +static void +pebs_update_state(bool needed_cb, struct cpu_hw_events *cpuc, struct pmu *pmu) { - return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); + /* + * Make sure we get updated with the first PEBS + * event. 
It will trigger also during removal, but + * that does not hurt: + */ + bool update = cpuc->n_pebs == 1; + + if (needed_cb != pebs_needs_sched_cb(cpuc)) { + if (!needed_cb) + perf_sched_cb_inc(pmu); + else + perf_sched_cb_dec(pmu); + + update = true; + } + + if (update) + pebs_update_threshold(cpuc); +} + +void intel_pmu_pebs_add(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + bool needed_cb = pebs_needs_sched_cb(cpuc); + + cpuc->n_pebs++; + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) + cpuc->n_large_pebs++; + + pebs_update_state(needed_cb, cpuc, event->ctx->pmu); } void intel_pmu_pebs_enable(struct perf_event *event) @@ -816,12 +872,9 @@ void intel_pmu_pebs_enable(struct perf_event *event) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; struct debug_store *ds = cpuc->ds; - bool first_pebs; - u64 threshold; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; - first_pebs = !pebs_is_enabled(cpuc); cpuc->pebs_enabled |= 1ULL << hwc->idx; if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) @@ -830,46 +883,34 @@ void intel_pmu_pebs_enable(struct perf_event *event) cpuc->pebs_enabled |= 1ULL << 63; /* - * When the event is constrained enough we can use a larger - * threshold and run the event with less frequent PMI. + * Use auto-reload if possible to save a MSR write in the PMI. + * This must be done in pmu::start(), because PERF_EVENT_IOC_PERIOD. */ - if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { - threshold = ds->pebs_absolute_maximum - - x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; - - if (first_pebs) - perf_sched_cb_inc(event->ctx->pmu); - } else { - threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; - - /* - * If not all events can use larger buffer, - * roll back to threshold = 1 - */ - if (!first_pebs && - (ds->pebs_interrupt_threshold > threshold)) - perf_sched_cb_dec(event->ctx->pmu); - } - - /* Use auto-reload if possible to save a MSR write in the PMI */ if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { ds->pebs_event_reset[hwc->idx] = (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; } +} + +void intel_pmu_pebs_del(struct perf_event *event) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + struct hw_perf_event *hwc = &event->hw; + bool needed_cb = pebs_needs_sched_cb(cpuc); - if (first_pebs || ds->pebs_interrupt_threshold > threshold) - ds->pebs_interrupt_threshold = threshold; + cpuc->n_pebs--; + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) + cpuc->n_large_pebs--; + + pebs_update_state(needed_cb, cpuc, event->ctx->pmu); } void intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; - struct debug_store *ds = cpuc->ds; - bool large_pebs = ds->pebs_interrupt_threshold > - ds->pebs_buffer_base + x86_pmu.pebs_record_size; - if (large_pebs) + if (cpuc->n_pebs == cpuc->n_large_pebs) intel_pmu_drain_pebs_buffer(); cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -879,9 +920,6 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); - if (large_pebs && !pebs_is_enabled(cpuc)) - perf_sched_cb_dec(event->ctx->pmu); - if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -1274,18 +1312,18 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) struct pebs_record_nhm *p = at; u64 pebs_status; - /* PEBS v3 has accurate status bits */ + 
pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; + + /* PEBS v3 has more accurate status bits */ if (x86_pmu.intel_cap.pebs_format >= 3) { - for_each_set_bit(bit, (unsigned long *)&p->status, - MAX_PEBS_EVENTS) + for_each_set_bit(bit, (unsigned long *)&pebs_status, + x86_pmu.max_pebs_events) counts[bit]++; continue; } - pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << x86_pmu.max_pebs_events) - 1; - /* * On some CPUs the PEBS status can be zero when PEBS is * racing with clearing of GLOBAL_STATUS. @@ -1333,8 +1371,11 @@ static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) continue; event = cpuc->events[bit]; - WARN_ON_ONCE(!event); - WARN_ON_ONCE(!event->attr.precise_ip); + if (WARN_ON_ONCE(!event)) + continue; + + if (WARN_ON_ONCE(!event->attr.precise_ip)) + continue; /* log dropped samples number */ if (error[bit]) diff --git a/arch/x86/events/intel/lbr.c b/arch/x86/events/intel/lbr.c index 707d358..fc6cf21 100644 --- a/arch/x86/events/intel/lbr.c +++ b/arch/x86/events/intel/lbr.c @@ -380,7 +380,6 @@ static void __intel_pmu_lbr_save(struct x86_perf_task_context *task_ctx) void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) { - struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; /* @@ -390,31 +389,21 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) */ task_ctx = ctx ? ctx->task_ctx_data : NULL; if (task_ctx) { - if (sched_in) { + if (sched_in) __intel_pmu_lbr_restore(task_ctx); - cpuc->lbr_context = ctx; - } else { + else __intel_pmu_lbr_save(task_ctx); - } return; } /* - * When sampling the branck stack in system-wide, it may be - * necessary to flush the stack on context switch. This happens - * when the branch stack does not tag its entries with the pid - * of the current task. Otherwise it becomes impossible to - * associate a branch entry with a task. This ambiguity is more - * likely to appear when the branch stack supports priv level - * filtering and the user sets it to monitor only at the user - * level (which could be a useful measurement in system-wide - * mode). In that case, the risk is high of having a branch - * stack with branch from multiple tasks. - */ - if (sched_in) { + * Since a context switch can flip the address space and LBR entries + * are not tagged with an identifier, we need to wipe the LBR, even for + * per-cpu events. You simply cannot resolve the branches from the old + * address space. + */ + if (sched_in) intel_pmu_lbr_reset(); - cpuc->lbr_context = ctx; - } } static inline bool branch_user_callstack(unsigned br_sel) @@ -422,7 +411,7 @@ static inline bool branch_user_callstack(unsigned br_sel) return (br_sel & X86_BR_USER) && (br_sel & X86_BR_CALL_STACK); } -void intel_pmu_lbr_enable(struct perf_event *event) +void intel_pmu_lbr_add(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; @@ -430,27 +419,38 @@ void intel_pmu_lbr_enable(struct perf_event *event) if (!x86_pmu.lbr_nr) return; - /* - * Reset the LBR stack if we changed task context to - * avoid data leaks. 
- */ - if (event->ctx->task && cpuc->lbr_context != event->ctx) { - intel_pmu_lbr_reset(); - cpuc->lbr_context = event->ctx; - } cpuc->br_sel = event->hw.branch_reg.reg; - if (branch_user_callstack(cpuc->br_sel) && event->ctx && - event->ctx->task_ctx_data) { + if (branch_user_callstack(cpuc->br_sel) && event->ctx->task_ctx_data) { task_ctx = event->ctx->task_ctx_data; task_ctx->lbr_callstack_users++; } - cpuc->lbr_users++; + /* + * Request pmu::sched_task() callback, which will fire inside the + * regular perf event scheduling, so that call will: + * + * - restore or wipe; when LBR-callstack, + * - wipe; otherwise, + * + * when this is from __perf_event_task_sched_in(). + * + * However, if this is from perf_install_in_context(), no such callback + * will follow and we'll need to reset the LBR here if this is the + * first LBR event. + * + * The problem is, we cannot tell these cases apart... but we can + * exclude the biggest chunk of cases by looking at + * event->total_time_running. An event that has accrued runtime cannot + * be 'new'. Conversely, a new event can get installed through the + * context switch path for the first time. + */ perf_sched_cb_inc(event->ctx->pmu); + if (!cpuc->lbr_users++ && !event->total_time_running) + intel_pmu_lbr_reset(); } -void intel_pmu_lbr_disable(struct perf_event *event) +void intel_pmu_lbr_del(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; @@ -467,12 +467,6 @@ void intel_pmu_lbr_disable(struct perf_event *event) cpuc->lbr_users--; WARN_ON_ONCE(cpuc->lbr_users < 0); perf_sched_cb_dec(event->ctx->pmu); - - if (cpuc->enabled && !cpuc->lbr_users) { - __intel_pmu_lbr_disable(); - /* avoid stale pointer */ - cpuc->lbr_context = NULL; - } } void intel_pmu_lbr_enable_all(bool pmi) diff --git a/arch/x86/events/intel/pt.c b/arch/x86/events/intel/pt.c index 04bb5fb..c5047b8 100644 --- a/arch/x86/events/intel/pt.c +++ b/arch/x86/events/intel/pt.c @@ -69,6 +69,8 @@ static struct pt_cap_desc { PT_CAP(psb_cyc, 0, CR_EBX, BIT(1)), PT_CAP(ip_filtering, 0, CR_EBX, BIT(2)), PT_CAP(mtc, 0, CR_EBX, BIT(3)), + PT_CAP(ptwrite, 0, CR_EBX, BIT(4)), + PT_CAP(power_event_trace, 0, CR_EBX, BIT(5)), PT_CAP(topa_output, 0, CR_ECX, BIT(0)), PT_CAP(topa_multiple_entries, 0, CR_ECX, BIT(1)), PT_CAP(single_range_output, 0, CR_ECX, BIT(2)), @@ -259,10 +261,16 @@ fail: #define RTIT_CTL_MTC (RTIT_CTL_MTC_EN | \ RTIT_CTL_MTC_RANGE) +#define RTIT_CTL_PTW (RTIT_CTL_PTW_EN | \ + RTIT_CTL_FUP_ON_PTW) + #define PT_CONFIG_MASK (RTIT_CTL_TSC_EN | \ RTIT_CTL_DISRETC | \ RTIT_CTL_CYC_PSB | \ - RTIT_CTL_MTC) + RTIT_CTL_MTC | \ + RTIT_CTL_PWR_EVT_EN | \ + RTIT_CTL_FUP_ON_PTW | \ + RTIT_CTL_PTW_EN) static bool pt_event_valid(struct perf_event *event) { @@ -311,6 +319,20 @@ static bool pt_event_valid(struct perf_event *event) return false; } + if (config & RTIT_CTL_PWR_EVT_EN && + !pt_cap_get(PT_CAP_power_event_trace)) + return false; + + if (config & RTIT_CTL_PTW) { + if (!pt_cap_get(PT_CAP_ptwrite)) + return false; + + /* FUPonPTW without PTW doesn't make sense */ + if ((config & RTIT_CTL_FUP_ON_PTW) && + !(config & RTIT_CTL_PTW_EN)) + return false; + } + return true; } @@ -1074,6 +1096,11 @@ static void pt_addr_filters_fini(struct perf_event *event) event->hw.addr_filters = NULL; } +static inline bool valid_kernel_ip(unsigned long ip) +{ + return virt_addr_valid(ip) && kernel_ip(ip); +} + static int pt_event_addr_filters_validate(struct list_head *filters) { struct perf_addr_filter *filter; @@ -1081,11 +1108,16 @@ 
static int pt_event_addr_filters_validate(struct list_head *filters) list_for_each_entry(filter, filters, entry) { /* PT doesn't support single address triggers */ - if (!filter->range) + if (!filter->range || !filter->size) return -EOPNOTSUPP; - if (!filter->inode && !kernel_ip(filter->offset)) - return -EINVAL; + if (!filter->inode) { + if (!valid_kernel_ip(filter->offset)) + return -EINVAL; + + if (!valid_kernel_ip(filter->offset + filter->size)) + return -EINVAL; + } if (++range > pt_cap_get(PT_CAP_num_address_ranges)) return -EOPNOTSUPP; @@ -1111,7 +1143,7 @@ static void pt_event_addr_filters_sync(struct perf_event *event) } else { /* apply the offset */ msr_a = filter->offset + offs[range]; - msr_b = filter->size + msr_a; + msr_b = filter->size + msr_a - 1; } filters->filter[range].msr_a = msr_a; diff --git a/arch/x86/events/intel/pt.h b/arch/x86/events/intel/pt.h index efffa4a..53473c2 100644 --- a/arch/x86/events/intel/pt.h +++ b/arch/x86/events/intel/pt.h @@ -26,11 +26,14 @@ #define RTIT_CTL_CYCLEACC BIT(1) #define RTIT_CTL_OS BIT(2) #define RTIT_CTL_USR BIT(3) +#define RTIT_CTL_PWR_EVT_EN BIT(4) +#define RTIT_CTL_FUP_ON_PTW BIT(5) #define RTIT_CTL_CR3EN BIT(7) #define RTIT_CTL_TOPA BIT(8) #define RTIT_CTL_MTC_EN BIT(9) #define RTIT_CTL_TSC_EN BIT(10) #define RTIT_CTL_DISRETC BIT(11) +#define RTIT_CTL_PTW_EN BIT(12) #define RTIT_CTL_BRANCH_EN BIT(13) #define RTIT_CTL_MTC_RANGE_OFFSET 14 #define RTIT_CTL_MTC_RANGE (0x0full << RTIT_CTL_MTC_RANGE_OFFSET) @@ -91,6 +94,8 @@ enum pt_capabilities { PT_CAP_psb_cyc, PT_CAP_ip_filtering, PT_CAP_mtc, + PT_CAP_ptwrite, + PT_CAP_power_event_trace, PT_CAP_topa_output, PT_CAP_topa_multiple_entries, PT_CAP_single_range_output, diff --git a/arch/x86/events/intel/rapl.c b/arch/x86/events/intel/rapl.c index 2886593..b0f0e83 100644 --- a/arch/x86/events/intel/rapl.c +++ b/arch/x86/events/intel/rapl.c @@ -357,6 +357,8 @@ static int rapl_pmu_event_init(struct perf_event *event) if (event->cpu < 0) return -EINVAL; + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + /* * check event is known (determines counter) */ @@ -765,6 +767,8 @@ static const struct x86_cpu_id rapl_cpu_match[] __initconst = { X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP, skl_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, hsx_rapl_init), + + X86_RAPL_MODEL_MATCH(INTEL_FAM6_ATOM_GOLDMONT, hsw_rapl_init), {}, }; diff --git a/arch/x86/events/intel/uncore.c b/arch/x86/events/intel/uncore.c index 463dc7a..d9844cc 100644 --- a/arch/x86/events/intel/uncore.c +++ b/arch/x86/events/intel/uncore.c @@ -664,6 +664,8 @@ static int uncore_pmu_event_init(struct perf_event *event) event->cpu = box->cpu; event->pmu_private = box; + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->hw.idx = -1; event->hw.last_tag = ~0ULL; event->hw.extra_reg.idx = EXTRA_REG_NONE; @@ -683,7 +685,8 @@ static int uncore_pmu_event_init(struct perf_event *event) /* fixed counters have event field hardcoded to zero */ hwc->config = 0ULL; } else { - hwc->config = event->attr.config & pmu->type->event_mask; + hwc->config = event->attr.config & + (pmu->type->event_mask | ((u64)pmu->type->event_mask_ext << 32)); if (pmu->type->ops->hw_config) { ret = pmu->type->ops->hw_config(box, event); if (ret) @@ -1321,6 +1324,11 @@ static const struct intel_uncore_init_fun skl_uncore_init __initconst = { .pci_init = skl_uncore_pci_init, }; +static const struct intel_uncore_init_fun skx_uncore_init __initconst = { + .cpu_init = skx_uncore_cpu_init, + 
.pci_init = skx_uncore_pci_init, +}; + static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM_EP, nhm_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_NEHALEM, nhm_uncore_init), @@ -1343,6 +1351,7 @@ static const struct x86_cpu_id intel_uncore_match[] __initconst = { X86_UNCORE_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_DESKTOP,skl_uncore_init), X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_MOBILE, skl_uncore_init), + X86_UNCORE_MODEL_MATCH(INTEL_FAM6_SKYLAKE_X, skx_uncore_init), {}, }; diff --git a/arch/x86/events/intel/uncore.h b/arch/x86/events/intel/uncore.h index 78b9c23..ad986c1 100644 --- a/arch/x86/events/intel/uncore.h +++ b/arch/x86/events/intel/uncore.h @@ -44,6 +44,7 @@ struct intel_uncore_type { unsigned perf_ctr; unsigned event_ctl; unsigned event_mask; + unsigned event_mask_ext; unsigned fixed_ctr; unsigned fixed_ctl; unsigned box_ctl; @@ -120,6 +121,7 @@ struct intel_uncore_box { }; #define UNCORE_BOX_FLAG_INITIATED 0 +#define UNCORE_BOX_FLAG_CTL_OFFS8 1 /* event config registers are 8-byte apart */ struct uncore_event_desc { struct kobj_attribute attr; @@ -172,6 +174,9 @@ static inline unsigned uncore_pci_fixed_ctr(struct intel_uncore_box *box) static inline unsigned uncore_pci_event_ctl(struct intel_uncore_box *box, int idx) { + if (test_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags)) + return idx * 8 + box->pmu->type->event_ctl; + return idx * 4 + box->pmu->type->event_ctl; } @@ -377,6 +382,8 @@ int bdx_uncore_pci_init(void); void bdx_uncore_cpu_init(void); int knl_uncore_pci_init(void); void knl_uncore_cpu_init(void); +int skx_uncore_pci_init(void); +void skx_uncore_cpu_init(void); /* perf_event_intel_uncore_nhmex.c */ void nhmex_uncore_cpu_init(void); diff --git a/arch/x86/events/intel/uncore_snb.c b/arch/x86/events/intel/uncore_snb.c index 9d35ec0..5f845ee 100644 --- a/arch/x86/events/intel/uncore_snb.c +++ b/arch/x86/events/intel/uncore_snb.c @@ -388,6 +388,8 @@ static int snb_uncore_imc_event_init(struct perf_event *event) event->cpu = box->cpu; event->pmu_private = box; + event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; + event->hw.idx = -1; event->hw.last_tag = ~0ULL; event->hw.extra_reg.idx = EXTRA_REG_NONE; diff --git a/arch/x86/events/intel/uncore_snbep.c b/arch/x86/events/intel/uncore_snbep.c index 8aee83b..2724277 100644 --- a/arch/x86/events/intel/uncore_snbep.c +++ b/arch/x86/events/intel/uncore_snbep.c @@ -1,6 +1,10 @@ /* SandyBridge-EP/IvyTown uncore support */ #include "uncore.h" +/* SNB-EP pci bus to socket mapping */ +#define SNBEP_CPUNODEID 0x40 +#define SNBEP_GIDNIDMAP 0x54 + /* SNB-EP Box level control */ #define SNBEP_PMON_BOX_CTL_RST_CTRL (1 << 0) #define SNBEP_PMON_BOX_CTL_RST_CTRS (1 << 1) @@ -264,15 +268,72 @@ SNBEP_PCU_MSR_PMON_CTL_OCC_INVERT | \ SNBEP_PCU_MSR_PMON_CTL_OCC_EDGE_DET) +/* SKX pci bus to socket mapping */ +#define SKX_CPUNODEID 0xc0 +#define SKX_GIDNIDMAP 0xd4 + +/* SKX CHA */ +#define SKX_CHA_MSR_PMON_BOX_FILTER_TID (0x1ffULL << 0) +#define SKX_CHA_MSR_PMON_BOX_FILTER_LINK (0xfULL << 9) +#define SKX_CHA_MSR_PMON_BOX_FILTER_STATE (0x3ffULL << 17) +#define SKX_CHA_MSR_PMON_BOX_FILTER_REM (0x1ULL << 32) +#define SKX_CHA_MSR_PMON_BOX_FILTER_LOC (0x1ULL << 33) +#define SKX_CHA_MSR_PMON_BOX_FILTER_ALL_OPC (0x1ULL << 35) +#define SKX_CHA_MSR_PMON_BOX_FILTER_NM (0x1ULL << 36) +#define SKX_CHA_MSR_PMON_BOX_FILTER_NOT_NM (0x1ULL << 37) +#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC0 (0x3ffULL << 41) +#define SKX_CHA_MSR_PMON_BOX_FILTER_OPC1 
(0x3ffULL << 51) +#define SKX_CHA_MSR_PMON_BOX_FILTER_C6 (0x1ULL << 61) +#define SKX_CHA_MSR_PMON_BOX_FILTER_NC (0x1ULL << 62) +#define SKX_CHA_MSR_PMON_BOX_FILTER_ISOC (0x1ULL << 63) + +/* SKX IIO */ +#define SKX_IIO0_MSR_PMON_CTL0 0xa48 +#define SKX_IIO0_MSR_PMON_CTR0 0xa41 +#define SKX_IIO0_MSR_PMON_BOX_CTL 0xa40 +#define SKX_IIO_MSR_OFFSET 0x20 + +#define SKX_PMON_CTL_TRESH_MASK (0xff << 24) +#define SKX_PMON_CTL_TRESH_MASK_EXT (0xf) +#define SKX_PMON_CTL_CH_MASK (0xff << 4) +#define SKX_PMON_CTL_FC_MASK (0x7 << 12) +#define SKX_IIO_PMON_RAW_EVENT_MASK (SNBEP_PMON_CTL_EV_SEL_MASK | \ + SNBEP_PMON_CTL_UMASK_MASK | \ + SNBEP_PMON_CTL_EDGE_DET | \ + SNBEP_PMON_CTL_INVERT | \ + SKX_PMON_CTL_TRESH_MASK) +#define SKX_IIO_PMON_RAW_EVENT_MASK_EXT (SKX_PMON_CTL_TRESH_MASK_EXT | \ + SKX_PMON_CTL_CH_MASK | \ + SKX_PMON_CTL_FC_MASK) + +/* SKX IRP */ +#define SKX_IRP0_MSR_PMON_CTL0 0xa5b +#define SKX_IRP0_MSR_PMON_CTR0 0xa59 +#define SKX_IRP0_MSR_PMON_BOX_CTL 0xa58 +#define SKX_IRP_MSR_OFFSET 0x20 + +/* SKX UPI */ +#define SKX_UPI_PCI_PMON_CTL0 0x350 +#define SKX_UPI_PCI_PMON_CTR0 0x318 +#define SKX_UPI_PCI_PMON_BOX_CTL 0x378 +#define SKX_PMON_CTL_UMASK_EXT 0xff + +/* SKX M2M */ +#define SKX_M2M_PCI_PMON_CTL0 0x228 +#define SKX_M2M_PCI_PMON_CTR0 0x200 +#define SKX_M2M_PCI_PMON_BOX_CTL 0x258 + DEFINE_UNCORE_FORMAT_ATTR(event, event, "config:0-7"); DEFINE_UNCORE_FORMAT_ATTR(event2, event, "config:0-6"); DEFINE_UNCORE_FORMAT_ATTR(event_ext, event, "config:0-7,21"); DEFINE_UNCORE_FORMAT_ATTR(use_occ_ctr, use_occ_ctr, "config:7"); DEFINE_UNCORE_FORMAT_ATTR(umask, umask, "config:8-15"); +DEFINE_UNCORE_FORMAT_ATTR(umask_ext, umask, "config:8-15,32-39"); DEFINE_UNCORE_FORMAT_ATTR(qor, qor, "config:16"); DEFINE_UNCORE_FORMAT_ATTR(edge, edge, "config:18"); DEFINE_UNCORE_FORMAT_ATTR(tid_en, tid_en, "config:19"); DEFINE_UNCORE_FORMAT_ATTR(inv, inv, "config:23"); +DEFINE_UNCORE_FORMAT_ATTR(thresh9, thresh, "config:24-35"); DEFINE_UNCORE_FORMAT_ATTR(thresh8, thresh, "config:24-31"); DEFINE_UNCORE_FORMAT_ATTR(thresh6, thresh, "config:24-29"); DEFINE_UNCORE_FORMAT_ATTR(thresh5, thresh, "config:24-28"); @@ -280,6 +341,8 @@ DEFINE_UNCORE_FORMAT_ATTR(occ_sel, occ_sel, "config:14-15"); DEFINE_UNCORE_FORMAT_ATTR(occ_invert, occ_invert, "config:30"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge, occ_edge, "config:14-51"); DEFINE_UNCORE_FORMAT_ATTR(occ_edge_det, occ_edge_det, "config:31"); +DEFINE_UNCORE_FORMAT_ATTR(ch_mask, ch_mask, "config:36-43"); +DEFINE_UNCORE_FORMAT_ATTR(fc_mask, fc_mask, "config:44-46"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid, filter_tid, "config1:0-4"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid2, filter_tid, "config1:0"); DEFINE_UNCORE_FORMAT_ATTR(filter_tid3, filter_tid, "config1:0-5"); @@ -288,18 +351,26 @@ DEFINE_UNCORE_FORMAT_ATTR(filter_cid, filter_cid, "config1:5"); DEFINE_UNCORE_FORMAT_ATTR(filter_link, filter_link, "config1:5-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link2, filter_link, "config1:6-8"); DEFINE_UNCORE_FORMAT_ATTR(filter_link3, filter_link, "config1:12"); +DEFINE_UNCORE_FORMAT_ATTR(filter_link4, filter_link, "config1:9-12"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid, filter_nid, "config1:10-17"); DEFINE_UNCORE_FORMAT_ATTR(filter_nid2, filter_nid, "config1:32-47"); DEFINE_UNCORE_FORMAT_ATTR(filter_state, filter_state, "config1:18-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_state2, filter_state, "config1:17-22"); DEFINE_UNCORE_FORMAT_ATTR(filter_state3, filter_state, "config1:17-23"); DEFINE_UNCORE_FORMAT_ATTR(filter_state4, filter_state, "config1:18-20"); +DEFINE_UNCORE_FORMAT_ATTR(filter_state5, 
filter_state, "config1:17-26"); +DEFINE_UNCORE_FORMAT_ATTR(filter_rem, filter_rem, "config1:32"); +DEFINE_UNCORE_FORMAT_ATTR(filter_loc, filter_loc, "config1:33"); +DEFINE_UNCORE_FORMAT_ATTR(filter_nm, filter_nm, "config1:36"); +DEFINE_UNCORE_FORMAT_ATTR(filter_not_nm, filter_not_nm, "config1:37"); DEFINE_UNCORE_FORMAT_ATTR(filter_local, filter_local, "config1:33"); DEFINE_UNCORE_FORMAT_ATTR(filter_all_op, filter_all_op, "config1:35"); DEFINE_UNCORE_FORMAT_ATTR(filter_nnm, filter_nnm, "config1:37"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc, filter_opc, "config1:23-31"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc2, filter_opc, "config1:52-60"); DEFINE_UNCORE_FORMAT_ATTR(filter_opc3, filter_opc, "config1:41-60"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc_0, filter_opc0, "config1:41-50"); +DEFINE_UNCORE_FORMAT_ATTR(filter_opc_1, filter_opc1, "config1:51-60"); DEFINE_UNCORE_FORMAT_ATTR(filter_nc, filter_nc, "config1:62"); DEFINE_UNCORE_FORMAT_ATTR(filter_c6, filter_c6, "config1:61"); DEFINE_UNCORE_FORMAT_ATTR(filter_isoc, filter_isoc, "config1:63"); @@ -1153,7 +1224,7 @@ static struct pci_driver snbep_uncore_pci_driver = { /* * build pci bus to socket mapping */ -static int snbep_pci2phy_map_init(int devid) +static int snbep_pci2phy_map_init(int devid, int nodeid_loc, int idmap_loc, bool reverse) { struct pci_dev *ubox_dev = NULL; int i, bus, nodeid, segment; @@ -1168,12 +1239,12 @@ static int snbep_pci2phy_map_init(int devid) break; bus = ubox_dev->bus->number; /* get the Node ID of the local register */ - err = pci_read_config_dword(ubox_dev, 0x40, &config); + err = pci_read_config_dword(ubox_dev, nodeid_loc, &config); if (err) break; nodeid = config; /* get the Node ID mapping */ - err = pci_read_config_dword(ubox_dev, 0x54, &config); + err = pci_read_config_dword(ubox_dev, idmap_loc, &config); if (err) break; @@ -1207,11 +1278,20 @@ static int snbep_pci2phy_map_init(int devid) raw_spin_lock(&pci2phy_map_lock); list_for_each_entry(map, &pci2phy_map_head, list) { i = -1; - for (bus = 255; bus >= 0; bus--) { - if (map->pbus_to_physid[bus] >= 0) - i = map->pbus_to_physid[bus]; - else - map->pbus_to_physid[bus] = i; + if (reverse) { + for (bus = 255; bus >= 0; bus--) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } + } else { + for (bus = 0; bus <= 255; bus++) { + if (map->pbus_to_physid[bus] >= 0) + i = map->pbus_to_physid[bus]; + else + map->pbus_to_physid[bus] = i; + } } } raw_spin_unlock(&pci2phy_map_lock); @@ -1224,7 +1304,7 @@ static int snbep_pci2phy_map_init(int devid) int snbep_uncore_pci_init(void) { - int ret = snbep_pci2phy_map_init(0x3ce0); + int ret = snbep_pci2phy_map_init(0x3ce0, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true); if (ret) return ret; uncore_pci_uncores = snbep_pci_uncores; @@ -1788,7 +1868,7 @@ static struct pci_driver ivbep_uncore_pci_driver = { int ivbep_uncore_pci_init(void) { - int ret = snbep_pci2phy_map_init(0x0e1e); + int ret = snbep_pci2phy_map_init(0x0e1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true); if (ret) return ret; uncore_pci_uncores = ivbep_pci_uncores; @@ -2897,7 +2977,7 @@ static struct pci_driver hswep_uncore_pci_driver = { int hswep_uncore_pci_init(void) { - int ret = snbep_pci2phy_map_init(0x2f1e); + int ret = snbep_pci2phy_map_init(0x2f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true); if (ret) return ret; uncore_pci_uncores = hswep_pci_uncores; @@ -3186,7 +3266,7 @@ static struct pci_driver bdx_uncore_pci_driver = { int bdx_uncore_pci_init(void) { - int ret = snbep_pci2phy_map_init(0x6f1e); + int ret = 
snbep_pci2phy_map_init(0x6f1e, SNBEP_CPUNODEID, SNBEP_GIDNIDMAP, true); if (ret) return ret; @@ -3196,3 +3276,525 @@ int bdx_uncore_pci_init(void) } /* end of BDX uncore support */ + +/* SKX uncore support */ + +static struct intel_uncore_type skx_uncore_ubox = { + .name = "ubox", + .num_counters = 2, + .num_boxes = 1, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .perf_ctr = HSWEP_U_MSR_PMON_CTR0, + .event_ctl = HSWEP_U_MSR_PMON_CTL0, + .event_mask = SNBEP_U_MSR_PMON_RAW_EVENT_MASK, + .fixed_ctr = HSWEP_U_MSR_PMON_UCLK_FIXED_CTR, + .fixed_ctl = HSWEP_U_MSR_PMON_UCLK_FIXED_CTL, + .ops = &ivbep_uncore_msr_ops, + .format_group = &ivbep_uncore_ubox_format_group, +}; + +static struct attribute *skx_uncore_cha_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_tid_en.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + &format_attr_filter_tid4.attr, + &format_attr_filter_link4.attr, + &format_attr_filter_state5.attr, + &format_attr_filter_rem.attr, + &format_attr_filter_loc.attr, + &format_attr_filter_nm.attr, + &format_attr_filter_all_op.attr, + &format_attr_filter_not_nm.attr, + &format_attr_filter_opc_0.attr, + &format_attr_filter_opc_1.attr, + &format_attr_filter_nc.attr, + &format_attr_filter_c6.attr, + &format_attr_filter_isoc.attr, + NULL, +}; + +static struct attribute_group skx_uncore_chabox_format_group = { + .name = "format", + .attrs = skx_uncore_cha_formats_attr, +}; + +static struct event_constraint skx_uncore_chabox_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x11, 0x1), + UNCORE_EVENT_CONSTRAINT(0x36, 0x1), + EVENT_CONSTRAINT_END +}; + +static struct extra_reg skx_uncore_cha_extra_regs[] = { + SNBEP_CBO_EVENT_EXTRA_REG(0x0334, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0534, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x0934, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x1134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x2134, 0xffff, 0x4), + SNBEP_CBO_EVENT_EXTRA_REG(0x8134, 0xffff, 0x4), +}; + +static u64 skx_cha_filter_mask(int fields) +{ + u64 mask = 0; + + if (fields & 0x1) + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_TID; + if (fields & 0x2) + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_LINK; + if (fields & 0x4) + mask |= SKX_CHA_MSR_PMON_BOX_FILTER_STATE; + return mask; +} + +static struct event_constraint * +skx_cha_get_constraint(struct intel_uncore_box *box, struct perf_event *event) +{ + return __snbep_cbox_get_constraint(box, event, skx_cha_filter_mask); +} + +static int skx_cha_hw_config(struct intel_uncore_box *box, struct perf_event *event) +{ + struct hw_perf_event_extra *reg1 = &event->hw.extra_reg; + struct extra_reg *er; + int idx = 0; + + for (er = skx_uncore_cha_extra_regs; er->msr; er++) { + if (er->event != (event->hw.config & er->config_mask)) + continue; + idx |= er->idx; + } + + if (idx) { + reg1->reg = HSWEP_C0_MSR_PMON_BOX_FILTER0 + + HSWEP_CBO_MSR_OFFSET * box->pmu->pmu_idx; + reg1->config = event->attr.config1 & skx_cha_filter_mask(idx); + reg1->idx = idx; + } + return 0; +} + +static struct intel_uncore_ops skx_uncore_chabox_ops = { + /* There is no frz_en for chabox ctl */ + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = hswep_cbox_enable_event, + .read_counter = uncore_msr_read_counter, + .hw_config = skx_cha_hw_config, + .get_constraint = skx_cha_get_constraint, + .put_constraint = snbep_cbox_put_constraint, +}; + +static struct 
intel_uncore_type skx_uncore_chabox = { + .name = "cha", + .num_counters = 4, + .perf_ctr_bits = 48, + .event_ctl = HSWEP_C0_MSR_PMON_CTL0, + .perf_ctr = HSWEP_C0_MSR_PMON_CTR0, + .event_mask = HSWEP_S_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_C0_MSR_PMON_BOX_CTL, + .msr_offset = HSWEP_CBO_MSR_OFFSET, + .num_shared_regs = 1, + .constraints = skx_uncore_chabox_constraints, + .ops = &skx_uncore_chabox_ops, + .format_group = &skx_uncore_chabox_format_group, +}; + +static struct attribute *skx_uncore_iio_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh9.attr, + &format_attr_ch_mask.attr, + &format_attr_fc_mask.attr, + NULL, +}; + +static struct attribute_group skx_uncore_iio_format_group = { + .name = "format", + .attrs = skx_uncore_iio_formats_attr, +}; + +static struct event_constraint skx_uncore_iio_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x83, 0x3), + UNCORE_EVENT_CONSTRAINT(0x88, 0xc), + UNCORE_EVENT_CONSTRAINT(0x95, 0xc), + UNCORE_EVENT_CONSTRAINT(0xc0, 0xc), + UNCORE_EVENT_CONSTRAINT(0xc5, 0xc), + UNCORE_EVENT_CONSTRAINT(0xd4, 0xc), + EVENT_CONSTRAINT_END +}; + +static void skx_iio_enable_event(struct intel_uncore_box *box, + struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + + wrmsrl(hwc->config_base, hwc->config | SNBEP_PMON_CTL_EN); +} + +static struct intel_uncore_ops skx_uncore_iio_ops = { + .init_box = ivbep_uncore_msr_init_box, + .disable_box = snbep_uncore_msr_disable_box, + .enable_box = snbep_uncore_msr_enable_box, + .disable_event = snbep_uncore_msr_disable_event, + .enable_event = skx_iio_enable_event, + .read_counter = uncore_msr_read_counter, +}; + +static struct intel_uncore_type skx_uncore_iio = { + .name = "iio", + .num_counters = 4, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SKX_IIO0_MSR_PMON_CTL0, + .perf_ctr = SKX_IIO0_MSR_PMON_CTR0, + .event_mask = SKX_IIO_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_IIO_PMON_RAW_EVENT_MASK_EXT, + .box_ctl = SKX_IIO0_MSR_PMON_BOX_CTL, + .msr_offset = SKX_IIO_MSR_OFFSET, + .constraints = skx_uncore_iio_constraints, + .ops = &skx_uncore_iio_ops, + .format_group = &skx_uncore_iio_format_group, +}; + +static struct attribute *skx_uncore_formats_attr[] = { + &format_attr_event.attr, + &format_attr_umask.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group skx_uncore_format_group = { + .name = "format", + .attrs = skx_uncore_formats_attr, +}; + +static struct intel_uncore_type skx_uncore_irp = { + .name = "irp", + .num_counters = 2, + .num_boxes = 5, + .perf_ctr_bits = 48, + .event_ctl = SKX_IRP0_MSR_PMON_CTL0, + .perf_ctr = SKX_IRP0_MSR_PMON_CTR0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SKX_IRP0_MSR_PMON_BOX_CTL, + .msr_offset = SKX_IRP_MSR_OFFSET, + .ops = &skx_uncore_iio_ops, + .format_group = &skx_uncore_format_group, +}; + +static struct intel_uncore_ops skx_uncore_pcu_ops = { + IVBEP_UNCORE_MSR_OPS_COMMON_INIT(), + .hw_config = hswep_pcu_hw_config, + .get_constraint = snbep_pcu_get_constraint, + .put_constraint = snbep_pcu_put_constraint, +}; + +static struct intel_uncore_type skx_uncore_pcu = { + .name = "pcu", + .num_counters = 4, + .num_boxes = 1, + .perf_ctr_bits = 48, + .perf_ctr = HSWEP_PCU_MSR_PMON_CTR0, + .event_ctl = HSWEP_PCU_MSR_PMON_CTL0, + .event_mask = SNBEP_PCU_MSR_PMON_RAW_EVENT_MASK, + .box_ctl = HSWEP_PCU_MSR_PMON_BOX_CTL, + .num_shared_regs = 1, + .ops = &skx_uncore_pcu_ops, + 
.format_group = &snbep_uncore_pcu_format_group, +}; + +static struct intel_uncore_type *skx_msr_uncores[] = { + &skx_uncore_ubox, + &skx_uncore_chabox, + &skx_uncore_iio, + &skx_uncore_irp, + &skx_uncore_pcu, + NULL, +}; + +static int skx_count_chabox(void) +{ + struct pci_dev *chabox_dev = NULL; + int bus, count = 0; + + while (1) { + chabox_dev = pci_get_device(PCI_VENDOR_ID_INTEL, 0x208d, chabox_dev); + if (!chabox_dev) + break; + if (count == 0) + bus = chabox_dev->bus->number; + if (bus != chabox_dev->bus->number) + break; + count++; + } + + pci_dev_put(chabox_dev); + return count; +} + +void skx_uncore_cpu_init(void) +{ + skx_uncore_chabox.num_boxes = skx_count_chabox(); + uncore_msr_uncores = skx_msr_uncores; +} + +static struct intel_uncore_type skx_uncore_imc = { + .name = "imc", + .num_counters = 4, + .num_boxes = 6, + .perf_ctr_bits = 48, + .fixed_ctr_bits = 48, + .fixed_ctr = SNBEP_MC_CHy_PCI_PMON_FIXED_CTR, + .fixed_ctl = SNBEP_MC_CHy_PCI_PMON_FIXED_CTL, + .event_descs = hswep_uncore_imc_events, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +static struct attribute *skx_upi_uncore_formats_attr[] = { + &format_attr_event_ext.attr, + &format_attr_umask_ext.attr, + &format_attr_edge.attr, + &format_attr_inv.attr, + &format_attr_thresh8.attr, + NULL, +}; + +static struct attribute_group skx_upi_uncore_format_group = { + .name = "format", + .attrs = skx_upi_uncore_formats_attr, +}; + +static void skx_upi_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, SKX_UPI_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); +} + +static struct intel_uncore_ops skx_upi_uncore_pci_ops = { + .init_box = skx_upi_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct intel_uncore_type skx_uncore_upi = { + .name = "upi", + .num_counters = 4, + .num_boxes = 3, + .perf_ctr_bits = 48, + .perf_ctr = SKX_UPI_PCI_PMON_CTR0, + .event_ctl = SKX_UPI_PCI_PMON_CTL0, + .event_mask = SNBEP_QPI_PCI_PMON_RAW_EVENT_MASK, + .event_mask_ext = SKX_PMON_CTL_UMASK_EXT, + .box_ctl = SKX_UPI_PCI_PMON_BOX_CTL, + .ops = &skx_upi_uncore_pci_ops, + .format_group = &skx_upi_uncore_format_group, +}; + +static void skx_m2m_uncore_pci_init_box(struct intel_uncore_box *box) +{ + struct pci_dev *pdev = box->pci_dev; + + __set_bit(UNCORE_BOX_FLAG_CTL_OFFS8, &box->flags); + pci_write_config_dword(pdev, SKX_M2M_PCI_PMON_BOX_CTL, IVBEP_PMON_BOX_CTL_INT); +} + +static struct intel_uncore_ops skx_m2m_uncore_pci_ops = { + .init_box = skx_m2m_uncore_pci_init_box, + .disable_box = snbep_uncore_pci_disable_box, + .enable_box = snbep_uncore_pci_enable_box, + .disable_event = snbep_uncore_pci_disable_event, + .enable_event = snbep_uncore_pci_enable_event, + .read_counter = snbep_uncore_pci_read_counter, +}; + +static struct intel_uncore_type skx_uncore_m2m = { + .name = "m2m", + .num_counters = 4, + .num_boxes = 2, + .perf_ctr_bits = 48, + .perf_ctr = SKX_M2M_PCI_PMON_CTR0, + .event_ctl = SKX_M2M_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SKX_M2M_PCI_PMON_BOX_CTL, + .ops = 
&skx_m2m_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +static struct event_constraint skx_uncore_m2pcie_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x23, 0x3), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type skx_uncore_m2pcie = { + .name = "m2pcie", + .num_counters = 4, + .num_boxes = 4, + .perf_ctr_bits = 48, + .constraints = skx_uncore_m2pcie_constraints, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +static struct event_constraint skx_uncore_m3upi_constraints[] = { + UNCORE_EVENT_CONSTRAINT(0x1d, 0x1), + UNCORE_EVENT_CONSTRAINT(0x1e, 0x1), + UNCORE_EVENT_CONSTRAINT(0x40, 0x7), + UNCORE_EVENT_CONSTRAINT(0x4e, 0x7), + UNCORE_EVENT_CONSTRAINT(0x4f, 0x7), + UNCORE_EVENT_CONSTRAINT(0x50, 0x7), + UNCORE_EVENT_CONSTRAINT(0x51, 0x7), + UNCORE_EVENT_CONSTRAINT(0x52, 0x7), + EVENT_CONSTRAINT_END +}; + +static struct intel_uncore_type skx_uncore_m3upi = { + .name = "m3upi", + .num_counters = 3, + .num_boxes = 3, + .perf_ctr_bits = 48, + .constraints = skx_uncore_m3upi_constraints, + .perf_ctr = SNBEP_PCI_PMON_CTR0, + .event_ctl = SNBEP_PCI_PMON_CTL0, + .event_mask = SNBEP_PMON_RAW_EVENT_MASK, + .box_ctl = SNBEP_PCI_PMON_BOX_CTL, + .ops = &ivbep_uncore_pci_ops, + .format_group = &skx_uncore_format_group, +}; + +enum { + SKX_PCI_UNCORE_IMC, + SKX_PCI_UNCORE_M2M, + SKX_PCI_UNCORE_UPI, + SKX_PCI_UNCORE_M2PCIE, + SKX_PCI_UNCORE_M3UPI, +}; + +static struct intel_uncore_type *skx_pci_uncores[] = { + [SKX_PCI_UNCORE_IMC] = &skx_uncore_imc, + [SKX_PCI_UNCORE_M2M] = &skx_uncore_m2m, + [SKX_PCI_UNCORE_UPI] = &skx_uncore_upi, + [SKX_PCI_UNCORE_M2PCIE] = &skx_uncore_m2pcie, + [SKX_PCI_UNCORE_M3UPI] = &skx_uncore_m3upi, + NULL, +}; + +static const struct pci_device_id skx_uncore_pci_ids[] = { + { /* MC0 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 2, SKX_PCI_UNCORE_IMC, 0), + }, + { /* MC0 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(10, 6, SKX_PCI_UNCORE_IMC, 1), + }, + { /* MC0 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(11, 2, SKX_PCI_UNCORE_IMC, 2), + }, + { /* MC1 Channel 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2042), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 2, SKX_PCI_UNCORE_IMC, 3), + }, + { /* MC1 Channel 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2046), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(12, 6, SKX_PCI_UNCORE_IMC, 4), + }, + { /* MC1 Channel 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204a), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(13, 2, SKX_PCI_UNCORE_IMC, 5), + }, + { /* M2M0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(8, 0, SKX_PCI_UNCORE_M2M, 0), + }, + { /* M2M1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2066), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(9, 0, SKX_PCI_UNCORE_M2M, 1), + }, + { /* UPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(14, 0, SKX_PCI_UNCORE_UPI, 0), + }, + { /* UPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(15, 0, SKX_PCI_UNCORE_UPI, 1), + }, + { /* UPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2058), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(16, 0, SKX_PCI_UNCORE_UPI, 2), + }, + { /* M2PCIe 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088), + 
.driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 1, SKX_PCI_UNCORE_M2PCIE, 0), + }, + { /* M2PCIe 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(22, 1, SKX_PCI_UNCORE_M2PCIE, 1), + }, + { /* M2PCIe 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(23, 1, SKX_PCI_UNCORE_M2PCIE, 2), + }, + { /* M2PCIe 3 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x2088), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(21, 5, SKX_PCI_UNCORE_M2PCIE, 3), + }, + { /* M3UPI0 Link 0 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 0, SKX_PCI_UNCORE_M3UPI, 0), + }, + { /* M3UPI0 Link 1 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204D), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 1, SKX_PCI_UNCORE_M3UPI, 1), + }, + { /* M3UPI1 Link 2 */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x204C), + .driver_data = UNCORE_PCI_DEV_FULL_DATA(18, 4, SKX_PCI_UNCORE_M3UPI, 2), + }, + { /* end: all zeroes */ } +}; + + +static struct pci_driver skx_uncore_pci_driver = { + .name = "skx_uncore", + .id_table = skx_uncore_pci_ids, +}; + +int skx_uncore_pci_init(void) +{ + /* need to double check pci address */ + int ret = snbep_pci2phy_map_init(0x2014, SKX_CPUNODEID, SKX_GIDNIDMAP, false); + + if (ret) + return ret; + + uncore_pci_uncores = skx_pci_uncores; + uncore_pci_driver = &skx_uncore_pci_driver; + return 0; +} + +/* end of SKX uncore support */ diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 8c4a477..5874d8d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -194,12 +194,13 @@ struct cpu_hw_events { */ struct debug_store *ds; u64 pebs_enabled; + int n_pebs; + int n_large_pebs; /* * Intel LBR bits */ int lbr_users; - void *lbr_context; struct perf_branch_stack lbr_stack; struct perf_branch_entry lbr_entries[MAX_LBR_ENTRIES]; struct er_account *lbr_sel; @@ -508,6 +509,8 @@ struct x86_pmu { void (*enable_all)(int added); void (*enable)(struct perf_event *); void (*disable)(struct perf_event *); + void (*add)(struct perf_event *); + void (*del)(struct perf_event *); int (*hw_config)(struct perf_event *event); int (*schedule_events)(struct cpu_hw_events *cpuc, int n, int *assign); unsigned eventsel; @@ -888,6 +891,10 @@ extern struct event_constraint intel_skl_pebs_event_constraints[]; struct event_constraint *intel_pebs_constraints(struct perf_event *event); +void intel_pmu_pebs_add(struct perf_event *event); + +void intel_pmu_pebs_del(struct perf_event *event); + void intel_pmu_pebs_enable(struct perf_event *event); void intel_pmu_pebs_disable(struct perf_event *event); @@ -906,9 +913,9 @@ u64 lbr_from_signext_quirk_wr(u64 val); void intel_pmu_lbr_reset(void); -void intel_pmu_lbr_enable(struct perf_event *event); +void intel_pmu_lbr_add(struct perf_event *event); -void intel_pmu_lbr_disable(struct perf_event *event); +void intel_pmu_lbr_del(struct perf_event *event); void intel_pmu_lbr_enable_all(bool pmi); diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 9733361..97848cd 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -158,53 +158,9 @@ extern void __add_wrong_size(void) * value of "*ptr". 
* * xadd() is locked when multiple CPUs are online - * xadd_sync() is always locked - * xadd_local() is never locked */ #define __xadd(ptr, inc, lock) __xchg_op((ptr), (inc), xadd, lock) #define xadd(ptr, inc) __xadd((ptr), (inc), LOCK_PREFIX) -#define xadd_sync(ptr, inc) __xadd((ptr), (inc), "lock; ") -#define xadd_local(ptr, inc) __xadd((ptr), (inc), "") - -#define __add(ptr, inc, lock) \ - ({ \ - __typeof__ (*(ptr)) __ret = (inc); \ - switch (sizeof(*(ptr))) { \ - case __X86_CASE_B: \ - asm volatile (lock "addb %b1, %0\n" \ - : "+m" (*(ptr)) : "qi" (inc) \ - : "memory", "cc"); \ - break; \ - case __X86_CASE_W: \ - asm volatile (lock "addw %w1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ - break; \ - case __X86_CASE_L: \ - asm volatile (lock "addl %1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ - break; \ - case __X86_CASE_Q: \ - asm volatile (lock "addq %1, %0\n" \ - : "+m" (*(ptr)) : "ri" (inc) \ - : "memory", "cc"); \ - break; \ - default: \ - __add_wrong_size(); \ - } \ - __ret; \ - }) - -/* - * add_*() adds "inc" to "*ptr" - * - * __add() takes a lock prefix - * add_smp() is locked when multiple CPUs are online - * add_sync() is always locked - */ -#define add_smp(ptr, inc) __add((ptr), (inc), LOCK_PREFIX) -#define add_sync(ptr, inc) __add((ptr), (inc), "lock; ") #define __cmpxchg_double(pfx, p1, p2, o1, o2, n1, n2) \ ({ \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 92a8308..1188bc8 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -106,7 +106,6 @@ #define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ #define X86_FEATURE_EAGER_FPU ( 3*32+29) /* "eagerfpu" Non lazy FPU restore */ #define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_MCE_RECOVERY ( 3*32+31) /* cpu has recoverable machine checks */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ diff --git a/arch/x86/include/asm/efi.h b/arch/x86/include/asm/efi.h index d0bb76d..389d700 100644 --- a/arch/x86/include/asm/efi.h +++ b/arch/x86/include/asm/efi.h @@ -117,7 +117,6 @@ extern int __init efi_memblock_x86_reserve_range(void); extern pgd_t * __init efi_call_phys_prolog(void); extern void __init efi_call_phys_epilog(pgd_t *save_pgd); extern void __init efi_print_memmap(void); -extern void __init efi_unmap_memmap(void); extern void __init efi_memory_uc(u64 addr, unsigned long size); extern void __init efi_map_region(efi_memory_desc_t *md); extern void __init efi_map_region_fixed(efi_memory_desc_t *md); @@ -192,14 +191,7 @@ static inline efi_status_t efi_thunk_set_virtual_address_map( struct efi_config { u64 image_handle; u64 table; - u64 allocate_pool; - u64 allocate_pages; - u64 get_memory_map; - u64 free_pool; - u64 free_pages; - u64 locate_handle; - u64 handle_protocol; - u64 exit_boot_services; + u64 boot_services; u64 text_output; efi_status_t (*call)(unsigned long, ...); bool is64; @@ -207,14 +199,27 @@ struct efi_config { __pure const struct efi_config *__efi_early(void); +static inline bool efi_is_64bit(void) +{ + if (!IS_ENABLED(CONFIG_X86_64)) + return false; + + if (!IS_ENABLED(CONFIG_EFI_MIXED)) + return true; + + return __efi_early()->is64; +} + #define efi_call_early(f, ...) \ - __efi_early()->call(__efi_early()->f, __VA_ARGS__); + __efi_early()->call(efi_is_64bit() ? 
\ + ((efi_boot_services_64_t *)(unsigned long) \ + __efi_early()->boot_services)->f : \ + ((efi_boot_services_32_t *)(unsigned long) \ + __efi_early()->boot_services)->f, __VA_ARGS__) #define __efi_call_early(f, ...) \ __efi_early()->call((unsigned long)f, __VA_ARGS__); -#define efi_is_64bit() __efi_early()->is64 - extern bool efi_reboot_required(void); #else diff --git a/arch/x86/include/asm/hypervisor.h b/arch/x86/include/asm/hypervisor.h index 055ea99..67942b6 100644 --- a/arch/x86/include/asm/hypervisor.h +++ b/arch/x86/include/asm/hypervisor.h @@ -43,6 +43,9 @@ struct hypervisor_x86 { /* X2APIC detection (run once per boot) */ bool (*x2apic_available)(void); + + /* pin current vcpu to specified physical cpu (run rarely) */ + void (*pin_vcpu)(int); }; extern const struct hypervisor_x86 *x86_hyper; @@ -56,6 +59,7 @@ extern const struct hypervisor_x86 x86_hyper_kvm; extern void init_hypervisor(struct cpuinfo_x86 *c); extern void init_hypervisor_platform(void); extern bool hypervisor_x2apic_available(void); +extern void hypervisor_pin_vcpu(int cpu); #else static inline void init_hypervisor(struct cpuinfo_x86 *c) { } static inline void init_hypervisor_platform(void) { } diff --git a/arch/x86/include/asm/mce.h b/arch/x86/include/asm/mce.h index 8bf766e..9bd7ff5 100644 --- a/arch/x86/include/asm/mce.h +++ b/arch/x86/include/asm/mce.h @@ -40,9 +40,10 @@ #define MCI_STATUS_AR (1ULL<<55) /* Action required */ /* AMD-specific bits */ +#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ +#define MCI_STATUS_SYNDV (1ULL<<53) /* synd reg. valid */ #define MCI_STATUS_DEFERRED (1ULL<<44) /* uncorrected error, deferred exception */ #define MCI_STATUS_POISON (1ULL<<43) /* access poisonous data */ -#define MCI_STATUS_TCC (1ULL<<55) /* Task context corrupt */ /* * McaX field if set indicates a given bank supports MCA extensions: @@ -110,6 +111,7 @@ #define MSR_AMD64_SMCA_MC0_MISC0 0xc0002003 #define MSR_AMD64_SMCA_MC0_CONFIG 0xc0002004 #define MSR_AMD64_SMCA_MC0_IPID 0xc0002005 +#define MSR_AMD64_SMCA_MC0_SYND 0xc0002006 #define MSR_AMD64_SMCA_MC0_DESTAT 0xc0002008 #define MSR_AMD64_SMCA_MC0_DEADDR 0xc0002009 #define MSR_AMD64_SMCA_MC0_MISC1 0xc000200a @@ -119,6 +121,7 @@ #define MSR_AMD64_SMCA_MCx_MISC(x) (MSR_AMD64_SMCA_MC0_MISC0 + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_CONFIG(x) (MSR_AMD64_SMCA_MC0_CONFIG + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_IPID(x) (MSR_AMD64_SMCA_MC0_IPID + 0x10*(x)) +#define MSR_AMD64_SMCA_MCx_SYND(x) (MSR_AMD64_SMCA_MC0_SYND + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DESTAT(x) (MSR_AMD64_SMCA_MC0_DESTAT + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_DEADDR(x) (MSR_AMD64_SMCA_MC0_DEADDR + 0x10*(x)) #define MSR_AMD64_SMCA_MCx_MISCy(x, y) ((MSR_AMD64_SMCA_MC0_MISC1 + y) + (0x10*(x))) @@ -334,44 +337,47 @@ extern void apei_mce_report_mem_error(int corrected, * Scalable MCA. 
*/ #ifdef CONFIG_X86_MCE_AMD -enum amd_ip_types { - SMCA_F17H_CORE = 0, /* Core errors */ - SMCA_DF, /* Data Fabric */ - SMCA_UMC, /* Unified Memory Controller */ - SMCA_PB, /* Parameter Block */ - SMCA_PSP, /* Platform Security Processor */ - SMCA_SMU, /* System Management Unit */ - N_AMD_IP_TYPES -}; - -struct amd_hwid { - const char *name; - unsigned int hwid; -}; - -extern struct amd_hwid amd_hwids[N_AMD_IP_TYPES]; -enum amd_core_mca_blocks { +/* These may be used by multiple smca_hwid_mcatypes */ +enum smca_bank_types { SMCA_LS = 0, /* Load Store */ SMCA_IF, /* Instruction Fetch */ - SMCA_L2_CACHE, /* L2 cache */ - SMCA_DE, /* Decoder unit */ - RES, /* Reserved */ - SMCA_EX, /* Execution unit */ + SMCA_L2_CACHE, /* L2 Cache */ + SMCA_DE, /* Decoder Unit */ + SMCA_EX, /* Execution Unit */ SMCA_FP, /* Floating Point */ - SMCA_L3_CACHE, /* L3 cache */ - N_CORE_MCA_BLOCKS + SMCA_L3_CACHE, /* L3 Cache */ + SMCA_CS, /* Coherent Slave */ + SMCA_PIE, /* Power, Interrupts, etc. */ + SMCA_UMC, /* Unified Memory Controller */ + SMCA_PB, /* Parameter Block */ + SMCA_PSP, /* Platform Security Processor */ + SMCA_SMU, /* System Management Unit */ + N_SMCA_BANK_TYPES }; -extern const char * const amd_core_mcablock_names[N_CORE_MCA_BLOCKS]; +struct smca_bank_name { + const char *name; /* Short name for sysfs */ + const char *long_name; /* Long name for pretty-printing */ +}; + +extern struct smca_bank_name smca_bank_names[N_SMCA_BANK_TYPES]; + +#define HWID_MCATYPE(hwid, mcatype) ((hwid << 16) | mcatype) -enum amd_df_mca_blocks { - SMCA_CS = 0, /* Coherent Slave */ - SMCA_PIE, /* Power management, Interrupts, etc */ - N_DF_BLOCKS +struct smca_hwid_mcatype { + unsigned int bank_type; /* Use with smca_bank_types for easy indexing. */ + u32 hwid_mcatype; /* (hwid,mcatype) tuple */ + u32 xec_bitmap; /* Bitmap of valid ExtErrorCodes; current max is 21. 
*/ }; -extern const char * const amd_df_mcablock_names[N_DF_BLOCKS]; +struct smca_bank_info { + struct smca_hwid_mcatype *type; + u32 type_instance; +}; + +extern struct smca_bank_info smca_banks[MAX_NR_BANKS]; + #endif #endif /* _ASM_X86_MCE_H */ diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 2970d22..4cd8db0 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -661,8 +661,6 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) -#ifdef CONFIG_QUEUED_SPINLOCKS - static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) { @@ -684,22 +682,6 @@ static __always_inline void pv_kick(int cpu) PVOP_VCALL1(pv_lock_ops.kick, cpu); } -#else /* !CONFIG_QUEUED_SPINLOCKS */ - -static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, - __ticket_t ticket) -{ - PVOP_VCALLEE2(pv_lock_ops.lock_spinning, lock, ticket); -} - -static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, - __ticket_t ticket) -{ - PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); -} - -#endif /* CONFIG_QUEUED_SPINLOCKS */ - #endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 7fa9e77..60aac60 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -301,23 +301,16 @@ struct pv_mmu_ops { struct arch_spinlock; #ifdef CONFIG_SMP #include <asm/spinlock_types.h> -#else -typedef u16 __ticket_t; #endif struct qspinlock; struct pv_lock_ops { -#ifdef CONFIG_QUEUED_SPINLOCKS void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); struct paravirt_callee_save queued_spin_unlock; void (*wait)(u8 *ptr, u8 val); void (*kick)(int cpu); -#else /* !CONFIG_QUEUED_SPINLOCKS */ - struct paravirt_callee_save lock_spinning; - void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); -#endif /* !CONFIG_QUEUED_SPINLOCKS */ }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 643eba4..2c1ebeb 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -46,10 +46,7 @@ static inline void arch_memcpy_to_pmem(void *dst, const void *src, size_t n) static inline int arch_memcpy_from_pmem(void *dst, const void *src, size_t n) { - if (static_cpu_has(X86_FEATURE_MCE_RECOVERY)) - return memcpy_mcsafe(dst, src, n); - memcpy(dst, src, n); - return 0; + return memcpy_mcsafe(dst, src, n); } /** diff --git a/arch/x86/include/asm/rwsem.h b/arch/x86/include/asm/rwsem.h index 8dbc762..3d33a71 100644 --- a/arch/x86/include/asm/rwsem.h +++ b/arch/x86/include/asm/rwsem.h @@ -154,7 +154,7 @@ static inline bool __down_write_trylock(struct rw_semaphore *sem) : "+m" (sem->count), "=&a" (tmp0), "=&r" (tmp1), CC_OUT(e) (result) : "er" (RWSEM_ACTIVE_WRITE_BIAS) - : "memory", "cc"); + : "memory"); return result; } diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index be0a059..921bea7 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -20,187 +20,13 @@ * (the type definitions are in asm/spinlock_types.h) */ -#ifdef CONFIG_X86_32 -# define LOCK_PTR_REG "a" -#else -# define LOCK_PTR_REG "D" -#endif - -#if defined(CONFIG_X86_32) && (defined(CONFIG_X86_PPRO_FENCE)) -/* - * On PPro SMP, we use a locked operation to unlock - * 
(PPro errata 66, 92) - */ -# define UNLOCK_LOCK_PREFIX LOCK_PREFIX -#else -# define UNLOCK_LOCK_PREFIX -#endif - /* How long a lock should spin before we consider blocking */ #define SPIN_THRESHOLD (1 << 15) extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); -#ifdef CONFIG_QUEUED_SPINLOCKS #include <asm/qspinlock.h> -#else - -#ifdef CONFIG_PARAVIRT_SPINLOCKS - -static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) -{ - set_bit(0, (volatile unsigned long *)&lock->tickets.head); -} - -#else /* !CONFIG_PARAVIRT_SPINLOCKS */ -static __always_inline void __ticket_lock_spinning(arch_spinlock_t *lock, - __ticket_t ticket) -{ -} -static inline void __ticket_unlock_kick(arch_spinlock_t *lock, - __ticket_t ticket) -{ -} - -#endif /* CONFIG_PARAVIRT_SPINLOCKS */ -static inline int __tickets_equal(__ticket_t one, __ticket_t two) -{ - return !((one ^ two) & ~TICKET_SLOWPATH_FLAG); -} - -static inline void __ticket_check_and_clear_slowpath(arch_spinlock_t *lock, - __ticket_t head) -{ - if (head & TICKET_SLOWPATH_FLAG) { - arch_spinlock_t old, new; - - old.tickets.head = head; - new.tickets.head = head & ~TICKET_SLOWPATH_FLAG; - old.tickets.tail = new.tickets.head + TICKET_LOCK_INC; - new.tickets.tail = old.tickets.tail; - - /* try to clear slowpath flag when there are no contenders */ - cmpxchg(&lock->head_tail, old.head_tail, new.head_tail); - } -} - -static __always_inline int arch_spin_value_unlocked(arch_spinlock_t lock) -{ - return __tickets_equal(lock.tickets.head, lock.tickets.tail); -} - -/* - * Ticket locks are conceptually two parts, one indicating the current head of - * the queue, and the other indicating the current tail. The lock is acquired - * by atomically noting the tail and incrementing it by one (thus adding - * ourself to the queue and noting our position), then waiting until the head - * becomes equal to the the initial value of the tail. - * - * We use an xadd covering *both* parts of the lock, to increment the tail and - * also load the position of the head, which takes care of memory ordering - * issues and should be optimal for the uncontended case. Note the tail must be - * in the high part, because a wide xadd increment of the low part would carry - * up and contaminate the high part. 
- */ -static __always_inline void arch_spin_lock(arch_spinlock_t *lock) -{ - register struct __raw_tickets inc = { .tail = TICKET_LOCK_INC }; - - inc = xadd(&lock->tickets, inc); - if (likely(inc.head == inc.tail)) - goto out; - - for (;;) { - unsigned count = SPIN_THRESHOLD; - - do { - inc.head = READ_ONCE(lock->tickets.head); - if (__tickets_equal(inc.head, inc.tail)) - goto clear_slowpath; - cpu_relax(); - } while (--count); - __ticket_lock_spinning(lock, inc.tail); - } -clear_slowpath: - __ticket_check_and_clear_slowpath(lock, inc.head); -out: - barrier(); /* make sure nothing creeps before the lock is taken */ -} - -static __always_inline int arch_spin_trylock(arch_spinlock_t *lock) -{ - arch_spinlock_t old, new; - - old.tickets = READ_ONCE(lock->tickets); - if (!__tickets_equal(old.tickets.head, old.tickets.tail)) - return 0; - - new.head_tail = old.head_tail + (TICKET_LOCK_INC << TICKET_SHIFT); - new.head_tail &= ~TICKET_SLOWPATH_FLAG; - - /* cmpxchg is a full barrier, so nothing can move before it */ - return cmpxchg(&lock->head_tail, old.head_tail, new.head_tail) == old.head_tail; -} - -static __always_inline void arch_spin_unlock(arch_spinlock_t *lock) -{ - if (TICKET_SLOWPATH_FLAG && - static_key_false(¶virt_ticketlocks_enabled)) { - __ticket_t head; - - BUILD_BUG_ON(((__ticket_t)NR_CPUS) != NR_CPUS); - - head = xadd(&lock->tickets.head, TICKET_LOCK_INC); - - if (unlikely(head & TICKET_SLOWPATH_FLAG)) { - head &= ~TICKET_SLOWPATH_FLAG; - __ticket_unlock_kick(lock, (head + TICKET_LOCK_INC)); - } - } else - __add(&lock->tickets.head, TICKET_LOCK_INC, UNLOCK_LOCK_PREFIX); -} - -static inline int arch_spin_is_locked(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = READ_ONCE(lock->tickets); - - return !__tickets_equal(tmp.tail, tmp.head); -} - -static inline int arch_spin_is_contended(arch_spinlock_t *lock) -{ - struct __raw_tickets tmp = READ_ONCE(lock->tickets); - - tmp.head &= ~TICKET_SLOWPATH_FLAG; - return (__ticket_t)(tmp.tail - tmp.head) > TICKET_LOCK_INC; -} -#define arch_spin_is_contended arch_spin_is_contended - -static __always_inline void arch_spin_lock_flags(arch_spinlock_t *lock, - unsigned long flags) -{ - arch_spin_lock(lock); -} - -static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) -{ - __ticket_t head = READ_ONCE(lock->tickets.head); - - for (;;) { - struct __raw_tickets tmp = READ_ONCE(lock->tickets); - /* - * We need to check "unlocked" in a loop, tmp.head == head - * can be false positive because of overflow. 
- */ - if (__tickets_equal(tmp.head, tmp.tail) || - !__tickets_equal(tmp.head, head)) - break; - - cpu_relax(); - } -} -#endif /* CONFIG_QUEUED_SPINLOCKS */ /* * Read-write spinlocks, allowing multiple readers diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 65c3e37..25311eb 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -23,20 +23,7 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) -#ifdef CONFIG_QUEUED_SPINLOCKS #include <asm-generic/qspinlock_types.h> -#else -typedef struct arch_spinlock { - union { - __ticketpair_t head_tail; - struct __raw_tickets { - __ticket_t head, tail; - } tickets; - }; -} arch_spinlock_t; - -#define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } -#endif /* CONFIG_QUEUED_SPINLOCKS */ #include <asm-generic/qrwlock_types.h> diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 90dbbd9..a164862 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -2,6 +2,7 @@ #define _ASM_X86_STRING_64_H #ifdef __KERNEL__ +#include <linux/jump_label.h> /* Written 2002 by Andi Kleen */ @@ -78,6 +79,9 @@ int strcmp(const char *cs, const char *ct); #define memset(s, c, n) __memset(s, c, n) #endif +__must_check int memcpy_mcsafe_unrolled(void *dst, const void *src, size_t cnt); +DECLARE_STATIC_KEY_FALSE(mcsafe_key); + /** * memcpy_mcsafe - copy memory with indication if a machine check happened * @@ -86,10 +90,23 @@ int strcmp(const char *cs, const char *ct); * @cnt: number of bytes to copy * * Low level memory copy function that catches machine checks + * We only call into the "safe" function on systems that can + * actually do machine check recovery. Everyone else can just + * use memcpy(). * * Return 0 for success, -EFAULT for fail */ -int memcpy_mcsafe(void *dst, const void *src, size_t cnt); +static __always_inline __must_check int +memcpy_mcsafe(void *dst, const void *src, size_t cnt) +{ +#ifdef CONFIG_X86_MCE + if (static_branch_unlikely(&mcsafe_key)) + return memcpy_mcsafe_unrolled(dst, src, cnt); + else +#endif + memcpy(dst, src, cnt); + return 0; +} #endif /* __KERNEL__ */ diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index 6fa8594..dee8a70 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -81,7 +81,7 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); /* Initialize cr4 shadow for this CPU. */ static inline void cr4_init_shadow(void) { - this_cpu_write(cpu_tlbstate.cr4, __read_cr4()); + this_cpu_write(cpu_tlbstate.cr4, __read_cr4_safe()); } /* Set in this cpu's CR4. 
*/ diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index c3f2911..2131c4c 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -433,7 +433,11 @@ do { \ #define __get_user_asm_ex(x, addr, itype, rtype, ltype) \ asm volatile("1: mov"itype" %1,%"rtype"0\n" \ "2:\n" \ - _ASM_EXTABLE_EX(1b, 2b) \ + ".section .fixup,\"ax\"\n" \ + "3:xor"itype" %"rtype"0,%"rtype"0\n" \ + " jmp 2b\n" \ + ".previous\n" \ + _ASM_EXTABLE_EX(1b, 3b) \ : ltype(x) : "m" (__m(addr))) #define __put_user_nocheck(x, ptr, size) \ @@ -705,7 +709,7 @@ static inline void copy_user_overflow(int size, unsigned long count) WARN(1, "Buffer overflow detected (%d < %lu)!\n", size, count); } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_from_user(void *to, const void __user *from, unsigned long n) { int sz = __compiletime_object_size(to); @@ -725,7 +729,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n) return n; } -static inline unsigned long __must_check +static __always_inline unsigned long __must_check copy_to_user(void __user *to, const void *from, unsigned long n) { int sz = __compiletime_object_size(from); diff --git a/arch/x86/include/uapi/asm/mce.h b/arch/x86/include/uapi/asm/mce.h index 2184943..69a6e07 100644 --- a/arch/x86/include/uapi/asm/mce.h +++ b/arch/x86/include/uapi/asm/mce.h @@ -26,6 +26,8 @@ struct mce { __u32 socketid; /* CPU socket ID */ __u32 apicid; /* CPU initial apic ID */ __u64 mcgcap; /* MCGCAP MSR: machine check capabilities of CPU */ + __u64 synd; /* MCA_SYND MSR: only valid on SMCA systems */ + __u64 ipid; /* MCA_IPID MSR: only valid on SMCA systems */ }; #define MCE_GET_RECORD_LEN _IOR('M', 1, int) diff --git a/arch/x86/kernel/acpi/Makefile b/arch/x86/kernel/acpi/Makefile index 3242e59..26b78d8 100644 --- a/arch/x86/kernel/acpi/Makefile +++ b/arch/x86/kernel/acpi/Makefile @@ -1,6 +1,7 @@ obj-$(CONFIG_ACPI) += boot.o obj-$(CONFIG_ACPI_SLEEP) += sleep.o wakeup_$(BITS).o obj-$(CONFIG_ACPI_APEI) += apei.o +obj-$(CONFIG_ACPI_CPPC_LIB) += cppc_msr.o ifneq ($(CONFIG_ACPI_PROCESSOR),) obj-y += cstate.o diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index fc88410..32a7d70 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -1496,7 +1496,7 @@ void __init acpi_boot_table_init(void) * If acpi_disabled, bail out */ if (acpi_disabled) - return; + return; /* * Initialize the ACPI boot-time table parser. diff --git a/arch/x86/kernel/acpi/cppc_msr.c b/arch/x86/kernel/acpi/cppc_msr.c new file mode 100644 index 0000000..6fb478b --- /dev/null +++ b/arch/x86/kernel/acpi/cppc_msr.c @@ -0,0 +1,58 @@ +/* + * cppc_msr.c: MSR Interface for CPPC + * Copyright (c) 2016, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. 
+ * + */ + +#include <acpi/cppc_acpi.h> +#include <asm/msr.h> + +/* Refer to drivers/acpi/cppc_acpi.c for the description of functions */ + +bool cpc_ffh_supported(void) +{ + return true; +} + +int cpc_read_ffh(int cpunum, struct cpc_reg *reg, u64 *val) +{ + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + *val &= mask; + *val >>= reg->bit_offset; + } + return err; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + u64 rd_val; + int err; + + err = rdmsrl_safe_on_cpu(cpunum, reg->address, &rd_val); + if (!err) { + u64 mask = GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + + val <<= reg->bit_offset; + val &= mask; + rd_val &= ~mask; + rd_val |= val; + err = wrmsrl_safe_on_cpu(cpunum, reg->address, rd_val); + } + return err; +} diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 809eda0..bcc9ccc 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -804,21 +804,20 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) identify_cpu_without_cpuid(c); /* cyrix could have cpuid enabled via c_identify()*/ - if (!have_cpuid_p()) - return; + if (have_cpuid_p()) { + cpu_detect(c); + get_cpu_vendor(c); + get_cpu_cap(c); - cpu_detect(c); - get_cpu_vendor(c); - get_cpu_cap(c); - - if (this_cpu->c_early_init) - this_cpu->c_early_init(c); + if (this_cpu->c_early_init) + this_cpu->c_early_init(c); - c->cpu_index = 0; - filter_cpuid_features(c, false); + c->cpu_index = 0; + filter_cpuid_features(c, false); - if (this_cpu->c_bsp_init) - this_cpu->c_bsp_init(c); + if (this_cpu->c_bsp_init) + this_cpu->c_bsp_init(c); + } setup_force_cpu_cap(X86_FEATURE_ALWAYS); fpu__init_system(c); diff --git a/arch/x86/kernel/cpu/hypervisor.c b/arch/x86/kernel/cpu/hypervisor.c index 27e4665..35691a6 100644 --- a/arch/x86/kernel/cpu/hypervisor.c +++ b/arch/x86/kernel/cpu/hypervisor.c @@ -86,3 +86,14 @@ bool __init hypervisor_x2apic_available(void) x86_hyper->x2apic_available && x86_hyper->x2apic_available(); } + +void hypervisor_pin_vcpu(int cpu) +{ + if (!x86_hyper) + return; + + if (x86_hyper->pin_vcpu) + x86_hyper->pin_vcpu(cpu); + else + WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); +} diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 79d8ec8..a7fdf45 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -41,6 +41,7 @@ #include <linux/debugfs.h> #include <linux/irq_work.h> #include <linux/export.h> +#include <linux/jump_label.h> #include <asm/processor.h> #include <asm/traps.h> @@ -292,6 +293,13 @@ static void print_mce(struct mce *m) if (m->misc) pr_cont("MISC %llx ", m->misc); + if (mce_flags.smca) { + if (m->synd) + pr_cont("SYND %llx ", m->synd); + if (m->ipid) + pr_cont("IPID %llx ", m->ipid); + } + pr_cont("\n"); /* * Note this output is parsed by external tools and old fields @@ -568,6 +576,7 @@ static void mce_read_aux(struct mce *m, int i) { if (m->status & MCI_STATUS_MISCV) m->misc = mce_rdmsrl(msr_ops.misc(i)); + if (m->status & MCI_STATUS_ADDRV) { m->addr = mce_rdmsrl(msr_ops.addr(i)); @@ -579,6 +588,23 @@ static void mce_read_aux(struct mce *m, int i) m->addr >>= shift; m->addr <<= shift; } + + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. 
+ */ + if (mce_flags.smca) { + u8 lsb = (m->addr >> 56) & 0x3f; + + m->addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + m->ipid = mce_rdmsrl(MSR_AMD64_SMCA_MCx_IPID(i)); + + if (m->status & MCI_STATUS_SYNDV) + m->synd = mce_rdmsrl(MSR_AMD64_SMCA_MCx_SYND(i)); } } @@ -1633,17 +1659,6 @@ static int __mcheck_cpu_apply_quirks(struct cpuinfo_x86 *c) if (c->x86 == 6 && c->x86_model == 45) quirk_no_way_out = quirk_sandybridge_ifu; - /* - * MCG_CAP.MCG_SER_P is necessary but not sufficient to know - * whether this processor will actually generate recoverable - * machine checks. Check to see if this is an E7 model Xeon. - * We can't do a model number check because E5 and E7 use the - * same model number. E5 doesn't support recovery, E7 does. - */ - if (mca_cfg.recovery || (mca_cfg.ser && - !strncmp(c->x86_model_id, - "Intel(R) Xeon(R) CPU E7-", 24))) - set_cpu_cap(c, X86_FEATURE_MCE_RECOVERY); } if (cfg->monarch_timeout < 0) cfg->monarch_timeout = 0; @@ -2080,6 +2095,7 @@ void mce_disable_bank(int bank) * mce=bootlog Log MCEs from before booting. Disabled by default on AMD. * mce=nobootlog Don't log MCEs from before booting. * mce=bios_cmci_threshold Don't program the CMCI threshold + * mce=recovery force enable memcpy_mcsafe() */ static int __init mcheck_enable(char *str) { @@ -2676,8 +2692,14 @@ static int __init mcheck_debugfs_init(void) static int __init mcheck_debugfs_init(void) { return -EINVAL; } #endif +DEFINE_STATIC_KEY_FALSE(mcsafe_key); +EXPORT_SYMBOL_GPL(mcsafe_key); + static int __init mcheck_late_init(void) { + if (mca_cfg.recovery) + static_branch_inc(&mcsafe_key); + mcheck_debugfs_init(); /* diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index 7b7f3be..9b54034 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -20,6 +20,7 @@ #include <linux/init.h> #include <linux/cpu.h> #include <linux/smp.h> +#include <linux/string.h> #include <asm/amd_nb.h> #include <asm/apic.h> @@ -63,34 +64,71 @@ static const char * const th_names[] = { "execution_unit", }; -/* Define HWID to IP type mappings for Scalable MCA */ -struct amd_hwid amd_hwids[] = { - [SMCA_F17H_CORE] = { "f17h_core", 0xB0 }, - [SMCA_DF] = { "data_fabric", 0x2E }, - [SMCA_UMC] = { "umc", 0x96 }, - [SMCA_PB] = { "param_block", 0x5 }, - [SMCA_PSP] = { "psp", 0xFF }, - [SMCA_SMU] = { "smu", 0x1 }, +static const char * const smca_umc_block_names[] = { + "dram_ecc", + "misc_umc" }; -EXPORT_SYMBOL_GPL(amd_hwids); - -const char * const amd_core_mcablock_names[] = { - [SMCA_LS] = "load_store", - [SMCA_IF] = "insn_fetch", - [SMCA_L2_CACHE] = "l2_cache", - [SMCA_DE] = "decode_unit", - [RES] = "", - [SMCA_EX] = "execution_unit", - [SMCA_FP] = "floating_point", - [SMCA_L3_CACHE] = "l3_cache", + +struct smca_bank_name smca_bank_names[] = { + [SMCA_LS] = { "load_store", "Load Store Unit" }, + [SMCA_IF] = { "insn_fetch", "Instruction Fetch Unit" }, + [SMCA_L2_CACHE] = { "l2_cache", "L2 Cache" }, + [SMCA_DE] = { "decode_unit", "Decode Unit" }, + [SMCA_EX] = { "execution_unit", "Execution Unit" }, + [SMCA_FP] = { "floating_point", "Floating Point Unit" }, + [SMCA_L3_CACHE] = { "l3_cache", "L3 Cache" }, + [SMCA_CS] = { "coherent_slave", "Coherent Slave" }, + [SMCA_PIE] = { "pie", "Power, Interrupts, etc." 
}, + [SMCA_UMC] = { "umc", "Unified Memory Controller" }, + [SMCA_PB] = { "param_block", "Parameter Block" }, + [SMCA_PSP] = { "psp", "Platform Security Processor" }, + [SMCA_SMU] = { "smu", "System Management Unit" }, }; -EXPORT_SYMBOL_GPL(amd_core_mcablock_names); +EXPORT_SYMBOL_GPL(smca_bank_names); + +static struct smca_hwid_mcatype smca_hwid_mcatypes[] = { + /* { bank_type, hwid_mcatype, xec_bitmap } */ + + /* ZN Core (HWID=0xB0) MCA types */ + { SMCA_LS, HWID_MCATYPE(0xB0, 0x0), 0x1FFFEF }, + { SMCA_IF, HWID_MCATYPE(0xB0, 0x1), 0x3FFF }, + { SMCA_L2_CACHE, HWID_MCATYPE(0xB0, 0x2), 0xF }, + { SMCA_DE, HWID_MCATYPE(0xB0, 0x3), 0x1FF }, + /* HWID 0xB0 MCATYPE 0x4 is Reserved */ + { SMCA_EX, HWID_MCATYPE(0xB0, 0x5), 0x7FF }, + { SMCA_FP, HWID_MCATYPE(0xB0, 0x6), 0x7F }, + { SMCA_L3_CACHE, HWID_MCATYPE(0xB0, 0x7), 0xFF }, + + /* Data Fabric MCA types */ + { SMCA_CS, HWID_MCATYPE(0x2E, 0x0), 0x1FF }, + { SMCA_PIE, HWID_MCATYPE(0x2E, 0x1), 0xF }, + + /* Unified Memory Controller MCA type */ + { SMCA_UMC, HWID_MCATYPE(0x96, 0x0), 0x3F }, + + /* Parameter Block MCA type */ + { SMCA_PB, HWID_MCATYPE(0x05, 0x0), 0x1 }, + + /* Platform Security Processor MCA type */ + { SMCA_PSP, HWID_MCATYPE(0xFF, 0x0), 0x1 }, -const char * const amd_df_mcablock_names[] = { - [SMCA_CS] = "coherent_slave", - [SMCA_PIE] = "pie", + /* System Management Unit MCA type */ + { SMCA_SMU, HWID_MCATYPE(0x01, 0x0), 0x1 }, }; -EXPORT_SYMBOL_GPL(amd_df_mcablock_names); + +struct smca_bank_info smca_banks[MAX_NR_BANKS]; +EXPORT_SYMBOL_GPL(smca_banks); + +/* + * In SMCA enabled processors, we can have multiple banks for a given IP type. + * So to define a unique name for each bank, we use a temp c-string to append + * the MCA_IPID[InstanceId] to type's name in get_name(). + * + * InstanceId is 32 bits which is 8 characters. Make sure MAX_MCATYPE_NAME_LEN + * is greater than 8 plus 1 (for underscore) plus length of longest type name. + */ +#define MAX_MCATYPE_NAME_LEN 30 +static char buf_mcatype[MAX_MCATYPE_NAME_LEN]; static DEFINE_PER_CPU(struct threshold_bank **, threshold_banks); static DEFINE_PER_CPU(unsigned int, bank_map); /* see which banks are on */ @@ -108,6 +146,36 @@ void (*deferred_error_int_vector)(void) = default_deferred_error_interrupt; * CPU Initialization */ +static void get_smca_bank_info(unsigned int bank) +{ + unsigned int i, hwid_mcatype, cpu = smp_processor_id(); + struct smca_hwid_mcatype *type; + u32 high, instanceId; + u16 hwid, mcatype; + + /* Collect bank_info using CPU 0 for now. 
*/ + if (cpu) + return; + + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &instanceId, &high)) { + pr_warn("Failed to read MCA_IPID for bank %d\n", bank); + return; + } + + hwid = high & MCI_IPID_HWID; + mcatype = (high & MCI_IPID_MCATYPE) >> 16; + hwid_mcatype = HWID_MCATYPE(hwid, mcatype); + + for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { + type = &smca_hwid_mcatypes[i]; + if (hwid_mcatype == type->hwid_mcatype) { + smca_banks[bank].type = type; + smca_banks[bank].type_instance = instanceId; + break; + } + } +} + struct thresh_restart { struct threshold_block *b; int reset; @@ -293,7 +361,7 @@ static void deferred_error_interrupt_enable(struct cpuinfo_x86 *c) wrmsr(MSR_CU_DEF_ERR, low, high); } -static u32 get_block_address(u32 current_addr, u32 low, u32 high, +static u32 get_block_address(unsigned int cpu, u32 current_addr, u32 low, u32 high, unsigned int bank, unsigned int block) { u32 addr = 0, offset = 0; @@ -309,13 +377,13 @@ static u32 get_block_address(u32 current_addr, u32 low, u32 high, */ u32 low, high; - if (rdmsr_safe(MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) + if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_CONFIG(bank), &low, &high)) return addr; if (!(low & MCI_CONFIG_MCAX)) return addr; - if (!rdmsr_safe(MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && + if (!rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_MISC(bank), &low, &high) && (low & MASK_BLKPTR_LO)) addr = MSR_AMD64_SMCA_MCx_MISCy(bank, block - 1); } @@ -395,6 +463,20 @@ prepare_threshold_block(unsigned int bank, unsigned int block, u32 addr, */ smca_high &= ~BIT(2); + /* + * SMCA sets the Deferred Error Interrupt type per bank. + * + * MCA_CONFIG[DeferredIntTypeSupported] is bit 5, and tells us + * if the DeferredIntType bit field is available. + * + * MCA_CONFIG[DeferredIntType] is bits [38:37] ([6:5] in the + * high portion of the MSR). OS should set this to 0x1 to enable + * APIC based interrupt. First, check that no interrupt has been + * set. + */ + if ((smca_low & BIT(5)) && !((smca_high >> 5) & 0x3)) + smca_high |= BIT(5); + wrmsr(smca_addr, smca_low, smca_high); } @@ -421,12 +503,15 @@ out: void mce_amd_feature_init(struct cpuinfo_x86 *c) { u32 low = 0, high = 0, address = 0; - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); int offset = -1; for (bank = 0; bank < mca_cfg.banks; ++bank) { + if (mce_flags.smca) + get_smca_bank_info(bank); + for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -476,9 +561,27 @@ __log_error(unsigned int bank, bool deferred_err, bool threshold_err, u64 misc) if (threshold_err) m.misc = misc; - if (m.status & MCI_STATUS_ADDRV) + if (m.status & MCI_STATUS_ADDRV) { rdmsrl(msr_addr, m.addr); + /* + * Extract [55:<lsb>] where lsb is the least significant + * *valid* bit of the address bits. 
+ */ + if (mce_flags.smca) { + u8 lsb = (m.addr >> 56) & 0x3f; + + m.addr &= GENMASK_ULL(55, lsb); + } + } + + if (mce_flags.smca) { + rdmsrl(MSR_AMD64_SMCA_MCx_IPID(bank), m.ipid); + + if (m.status & MCI_STATUS_SYNDV) + rdmsrl(MSR_AMD64_SMCA_MCx_SYND(bank), m.synd); + } + mce_log(&m); wrmsrl(msr_status, 0); @@ -541,15 +644,14 @@ static void amd_deferred_error_interrupt(void) static void amd_threshold_interrupt(void) { u32 low = 0, high = 0, address = 0; - int cpu = smp_processor_id(); - unsigned int bank, block; + unsigned int bank, block, cpu = smp_processor_id(); /* assume first bank caused it */ for (bank = 0; bank < mca_cfg.banks; ++bank) { if (!(per_cpu(bank_map, cpu) & (1 << bank))) continue; for (block = 0; block < NR_BLOCKS; ++block) { - address = get_block_address(address, low, high, bank, block); + address = get_block_address(cpu, address, low, high, bank, block); if (!address) break; @@ -713,6 +815,34 @@ static struct kobj_type threshold_ktype = { .default_attrs = default_attrs, }; +static const char *get_name(unsigned int bank, struct threshold_block *b) +{ + unsigned int bank_type; + + if (!mce_flags.smca) { + if (b && bank == 4) + return bank4_names(b); + + return th_names[bank]; + } + + if (!smca_banks[bank].type) + return NULL; + + bank_type = smca_banks[bank].type->bank_type; + + if (b && bank_type == SMCA_UMC) { + if (b->block < ARRAY_SIZE(smca_umc_block_names)) + return smca_umc_block_names[b->block]; + return NULL; + } + + snprintf(buf_mcatype, MAX_MCATYPE_NAME_LEN, + "%s_%x", smca_bank_names[bank_type].name, + smca_banks[bank].type_instance); + return buf_mcatype; +} + static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, unsigned int block, u32 address) { @@ -767,11 +897,11 @@ static int allocate_threshold_blocks(unsigned int cpu, unsigned int bank, err = kobject_init_and_add(&b->kobj, &threshold_ktype, per_cpu(threshold_banks, cpu)[bank]->kobj, - (bank == 4 ? 
bank4_names(b) : th_names[bank])); + get_name(bank, b)); if (err) goto out_free; recurse: - address = get_block_address(address, low, high, bank, ++block); + address = get_block_address(cpu, address, low, high, bank, ++block); if (!address) return 0; @@ -822,7 +952,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) struct device *dev = per_cpu(mce_device, cpu); struct amd_northbridge *nb = NULL; struct threshold_bank *b = NULL; - const char *name = th_names[bank]; + const char *name = get_name(bank, NULL); int err = 0; if (is_shared_bank(bank)) { @@ -869,7 +999,7 @@ static int threshold_create_bank(unsigned int cpu, unsigned int bank) } } - err = allocate_threshold_blocks(cpu, bank, 0, MSR_IA32_MCx_MISC(bank)); + err = allocate_threshold_blocks(cpu, bank, 0, msr_ops.misc(bank)); if (!err) goto out; diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 1726c4c..865058d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -575,9 +575,6 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } - -#ifdef CONFIG_QUEUED_SPINLOCKS - #include <asm/qspinlock.h> static void kvm_wait(u8 *ptr, u8 val) @@ -606,243 +603,6 @@ out: local_irq_restore(flags); } -#else /* !CONFIG_QUEUED_SPINLOCKS */ - -enum kvm_contention_stat { - TAKEN_SLOW, - TAKEN_SLOW_PICKUP, - RELEASED_SLOW, - RELEASED_SLOW_KICKED, - NR_CONTENTION_STATS -}; - -#ifdef CONFIG_KVM_DEBUG_FS -#define HISTO_BUCKETS 30 - -static struct kvm_spinlock_stats -{ - u32 contention_stats[NR_CONTENTION_STATS]; - u32 histo_spin_blocked[HISTO_BUCKETS+1]; - u64 time_blocked; -} spinlock_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - u8 ret; - u8 old; - - old = READ_ONCE(zero_stats); - if (unlikely(old)) { - ret = cmpxchg(&zero_stats, old, 0); - /* This ensures only one fellow resets the stat */ - if (ret == old) - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - } -} - -static inline void add_stats(enum kvm_contention_stat var, u32 val) -{ - check_zero(); - spinlock_stats.contention_stats[var] += val; -} - - -static inline u64 spin_time_start(void) -{ - return sched_clock(); -} - -static void __spin_time_accum(u64 delta, u32 *array) -{ - unsigned index; - - index = ilog2(delta); - check_zero(); - - if (index < HISTO_BUCKETS) - array[index]++; - else - array[HISTO_BUCKETS]++; -} - -static inline void spin_time_accum_blocked(u64 start) -{ - u32 delta; - - delta = sched_clock() - start; - __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); - spinlock_stats.time_blocked += delta; -} - -static struct dentry *d_spin_debug; -static struct dentry *d_kvm_debug; - -static struct dentry *kvm_init_debugfs(void) -{ - d_kvm_debug = debugfs_create_dir("kvm-guest", NULL); - if (!d_kvm_debug) - printk(KERN_WARNING "Could not create 'kvm' debugfs directory\n"); - - return d_kvm_debug; -} - -static int __init kvm_spinlock_debugfs(void) -{ - struct dentry *d_kvm; - - d_kvm = kvm_init_debugfs(); - if (d_kvm == NULL) - return -ENOMEM; - - d_spin_debug = debugfs_create_dir("spinlocks", d_kvm); - - debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - - debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW]); - debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); - - debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW]); - debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - 
&spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - - debugfs_create_u64("time_blocked", 0444, d_spin_debug, - &spinlock_stats.time_blocked); - - debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); - - return 0; -} -fs_initcall(kvm_spinlock_debugfs); -#else /* !CONFIG_KVM_DEBUG_FS */ -static inline void add_stats(enum kvm_contention_stat var, u32 val) -{ -} - -static inline u64 spin_time_start(void) -{ - return 0; -} - -static inline void spin_time_accum_blocked(u64 start) -{ -} -#endif /* CONFIG_KVM_DEBUG_FS */ - -struct kvm_lock_waiting { - struct arch_spinlock *lock; - __ticket_t want; -}; - -/* cpus 'waiting' on a spinlock to become available */ -static cpumask_t waiting_cpus; - -/* Track spinlock on which a cpu is waiting */ -static DEFINE_PER_CPU(struct kvm_lock_waiting, klock_waiting); - -__visible void kvm_lock_spinning(struct arch_spinlock *lock, __ticket_t want) -{ - struct kvm_lock_waiting *w; - int cpu; - u64 start; - unsigned long flags; - __ticket_t head; - - if (in_nmi()) - return; - - w = this_cpu_ptr(&klock_waiting); - cpu = smp_processor_id(); - start = spin_time_start(); - - /* - * Make sure an interrupt handler can't upset things in a - * partially setup state. - */ - local_irq_save(flags); - - /* - * The ordering protocol on this is that the "lock" pointer - * may only be set non-NULL if the "want" ticket is correct. - * If we're updating "want", we must first clear "lock". - */ - w->lock = NULL; - smp_wmb(); - w->want = want; - smp_wmb(); - w->lock = lock; - - add_stats(TAKEN_SLOW, 1); - - /* - * This uses set_bit, which is atomic but we should not rely on its - * reordering gurantees. So barrier is needed after this call. - */ - cpumask_set_cpu(cpu, &waiting_cpus); - - barrier(); - - /* - * Mark entry to slowpath before doing the pickup test to make - * sure we don't deadlock with an unlocker. - */ - __ticket_enter_slowpath(lock); - - /* make sure enter_slowpath, which is atomic does not cross the read */ - smp_mb__after_atomic(); - - /* - * check again make sure it didn't become free while - * we weren't looking. - */ - head = READ_ONCE(lock->tickets.head); - if (__tickets_equal(head, want)) { - add_stats(TAKEN_SLOW_PICKUP, 1); - goto out; - } - - /* - * halt until it's our turn and kicked. Note that we do safe halt - * for irq enabled case to avoid hang when lock info is overwritten - * in irq spinlock slowpath and no spurious interrupt occur to save us. - */ - if (arch_irqs_disabled_flags(flags)) - halt(); - else - safe_halt(); - -out: - cpumask_clear_cpu(cpu, &waiting_cpus); - w->lock = NULL; - local_irq_restore(flags); - spin_time_accum_blocked(start); -} -PV_CALLEE_SAVE_REGS_THUNK(kvm_lock_spinning); - -/* Kick vcpu waiting on @lock->head to reach value @ticket */ -static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) -{ - int cpu; - - add_stats(RELEASED_SLOW, 1); - for_each_cpu(cpu, &waiting_cpus) { - const struct kvm_lock_waiting *w = &per_cpu(klock_waiting, cpu); - if (READ_ONCE(w->lock) == lock && - READ_ONCE(w->want) == ticket) { - add_stats(RELEASED_SLOW_KICKED, 1); - kvm_kick_cpu(cpu); - break; - } - } -} - -#endif /* !CONFIG_QUEUED_SPINLOCKS */ - /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. 
*/ @@ -854,16 +614,11 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; -#ifdef CONFIG_QUEUED_SPINLOCKS __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = kvm_wait; pv_lock_ops.kick = kvm_kick_cpu; -#else /* !CONFIG_QUEUED_SPINLOCKS */ - pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); - pv_lock_ops.unlock_kick = kvm_unlock_kick; -#endif } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c index 1d39bfb..3692249 100644 --- a/arch/x86/kernel/kvmclock.c +++ b/arch/x86/kernel/kvmclock.c @@ -289,6 +289,7 @@ void __init kvmclock_init(void) put_cpu(); x86_platform.calibrate_tsc = kvm_get_tsc_khz; + x86_platform.calibrate_cpu = kvm_get_tsc_khz; x86_platform.get_wallclock = kvm_get_wallclock; x86_platform.set_wallclock = kvm_set_wallclock; #ifdef CONFIG_X86_LOCAL_APIC diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index 1939a02..2c55a00 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,7 +8,6 @@ #include <asm/paravirt.h> -#ifdef CONFIG_QUEUED_SPINLOCKS __visible void __native_queued_spin_unlock(struct qspinlock *lock) { native_queued_spin_unlock(lock); @@ -21,19 +20,13 @@ bool pv_is_native_spin_unlock(void) return pv_lock_ops.queued_spin_unlock.func == __raw_callee_save___native_queued_spin_unlock; } -#endif struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP -#ifdef CONFIG_QUEUED_SPINLOCKS .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), .wait = paravirt_nop, .kick = paravirt_nop, -#else /* !CONFIG_QUEUED_SPINLOCKS */ - .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), - .unlock_kick = paravirt_nop, -#endif /* !CONFIG_QUEUED_SPINLOCKS */ #endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index 158dc06..920c6ae 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -10,7 +10,7 @@ DEF_NATIVE(pv_mmu_ops, write_cr3, "mov %eax, %cr3"); DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); #endif @@ -49,7 +49,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index e70087a..bb3840c 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -19,7 +19,7 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); #endif @@ -61,7 
+61,7 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); -#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +#if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): if (pv_is_native_spin_unlock()) { start = start_pv_lock_ops_queued_spin_unlock; diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index cc457ff..51402a7 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -626,3 +626,34 @@ DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3, amd_disable_seq_and_redirect_scrub); #endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_X86_MCE) +#include <linux/jump_label.h> +#include <asm/string_64.h> + +/* Ivy Bridge, Haswell, Broadwell */ +static void quirk_intel_brickland_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if (capid0 & 0x10) + static_branch_inc(&mcsafe_key); +} + +/* Skylake */ +static void quirk_intel_purley_xeon_ras_cap(struct pci_dev *pdev) +{ + u32 capid0; + + pci_read_config_dword(pdev, 0x84, &capid0); + + if ((capid0 & 0xc0) == 0xc0) + static_branch_inc(&mcsafe_key); +} +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x0ec3, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, quirk_intel_brickland_xeon_ras_cap); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2083, quirk_intel_purley_xeon_ras_cap); +#endif diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index cbf5634..2c4bc85 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -1096,19 +1096,19 @@ void __init setup_arch(char **cmdline_p) memblock_set_current_limit(ISA_END_ADDRESS); memblock_x86_fill(); - if (efi_enabled(EFI_BOOT)) { + reserve_bios_regions(); + + if (efi_enabled(EFI_MEMMAP)) { efi_fake_memmap(); efi_find_mirror(); - } - - reserve_bios_regions(); + efi_esrt_init(); - /* - * The EFI specification says that boot service code won't be called - * after ExitBootServices(). This is, in fact, a lie. - */ - if (efi_enabled(EFI_MEMMAP)) + /* + * The EFI specification says that boot service code won't be + * called after ExitBootServices(). This is, in fact, a lie. + */ efi_reserve_boot_services(); + } /* preallocate 4k for mptable mpc */ early_reserve_e820_mpc_new(); @@ -1137,9 +1137,7 @@ void __init setup_arch(char **cmdline_p) * auditing all the early-boot CR4 manipulation would be needed to * rule it out. */ - if (boot_cpu_data.cpuid_level >= 0) - /* A CPU has %cr4 if and only if it has CPUID. 
*/ - mmu_cr4_features = __read_cr4(); + mmu_cr4_features = __read_cr4_safe(); memblock_set_current_limit(get_max_mapped()); diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index c7a2f50b..54e2f1a 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -471,7 +471,7 @@ static bool match_die(struct cpuinfo_x86 *c, struct cpuinfo_x86 *o) return false; } -static struct sched_domain_topology_level numa_inside_package_topology[] = { +static struct sched_domain_topology_level x86_numa_in_package_topology[] = { #ifdef CONFIG_SCHED_SMT { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, #endif @@ -480,22 +480,23 @@ static struct sched_domain_topology_level numa_inside_package_topology[] = { #endif { NULL, }, }; + +static struct sched_domain_topology_level x86_topology[] = { +#ifdef CONFIG_SCHED_SMT + { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, +#endif +#ifdef CONFIG_SCHED_MC + { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, +#endif + { cpu_cpu_mask, SD_INIT_NAME(DIE) }, + { NULL, }, +}; + /* - * set_sched_topology() sets the topology internal to a CPU. The - * NUMA topologies are layered on top of it to build the full - * system topology. - * - * If NUMA nodes are observed to occur within a CPU package, this - * function should be called. It forces the sched domain code to - * only use the SMT level for the CPU portion of the topology. - * This essentially falls back to relying on NUMA information - * from the SRAT table to describe the entire system topology - * (except for hyperthreads). + * Set if a package/die has multiple NUMA nodes inside. + * AMD Magny-Cours and Intel Cluster-on-Die have this. */ -static void primarily_use_numa_for_topology(void) -{ - set_sched_topology(numa_inside_package_topology); -} +static bool x86_has_numa_in_package; void set_cpu_sibling_map(int cpu) { @@ -558,7 +559,7 @@ void set_cpu_sibling_map(int cpu) c->booted_cores = cpu_data(i).booted_cores; } if (match_die(c, o) && !topology_same_node(c, o)) - primarily_use_numa_for_topology(); + x86_has_numa_in_package = true; } threads = cpumask_weight(topology_sibling_cpumask(cpu)); @@ -1304,6 +1305,16 @@ void __init native_smp_prepare_cpus(unsigned int max_cpus) zalloc_cpumask_var(&per_cpu(cpu_core_map, i), GFP_KERNEL); zalloc_cpumask_var(&per_cpu(cpu_llc_shared_map, i), GFP_KERNEL); } + + /* + * Set 'default' x86 topology, this matches default_topology() in that + * it has NUMA nodes as a topology level. See also + * native_smp_cpus_done(). + * + * Must be done before set_cpus_sibling_map() is ran. 
+ */ + set_sched_topology(x86_topology); + set_cpu_sibling_map(0); switch (smp_sanity_check(max_cpus)) { @@ -1369,6 +1380,9 @@ void __init native_smp_cpus_done(unsigned int max_cpus) { pr_debug("Boot done\n"); + if (x86_has_numa_in_package) + set_sched_topology(x86_numa_in_package_topology); + nmi_selftest(); impress_friends(); setup_ioapic_dest(); diff --git a/arch/x86/kernel/x8664_ksyms_64.c b/arch/x86/kernel/x8664_ksyms_64.c index 95e49f6..b2cee3d1 100644 --- a/arch/x86/kernel/x8664_ksyms_64.c +++ b/arch/x86/kernel/x8664_ksyms_64.c @@ -38,7 +38,7 @@ EXPORT_SYMBOL(__copy_user_nocache); EXPORT_SYMBOL(_copy_from_user); EXPORT_SYMBOL(_copy_to_user); -EXPORT_SYMBOL_GPL(memcpy_mcsafe); +EXPORT_SYMBOL_GPL(memcpy_mcsafe_unrolled); EXPORT_SYMBOL(copy_page); EXPORT_SYMBOL(clear_page); diff --git a/arch/x86/kvm/ioapic.c b/arch/x86/kvm/ioapic.c index 5f42d03..c7220ba 100644 --- a/arch/x86/kvm/ioapic.c +++ b/arch/x86/kvm/ioapic.c @@ -109,6 +109,7 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) { bool new_val, old_val; struct kvm_ioapic *ioapic = vcpu->kvm->arch.vioapic; + struct dest_map *dest_map = &ioapic->rtc_status.dest_map; union kvm_ioapic_redirect_entry *e; e = &ioapic->redirtbl[RTC_GSI]; @@ -117,16 +118,17 @@ static void __rtc_irq_eoi_tracking_restore_one(struct kvm_vcpu *vcpu) return; new_val = kvm_apic_pending_eoi(vcpu, e->fields.vector); - old_val = test_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + old_val = test_bit(vcpu->vcpu_id, dest_map->map); if (new_val == old_val) return; if (new_val) { - __set_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + __set_bit(vcpu->vcpu_id, dest_map->map); + dest_map->vectors[vcpu->vcpu_id] = e->fields.vector; ioapic->rtc_status.pending_eoi++; } else { - __clear_bit(vcpu->vcpu_id, ioapic->rtc_status.dest_map.map); + __clear_bit(vcpu->vcpu_id, dest_map->map); ioapic->rtc_status.pending_eoi--; rtc_status_pending_eoi_check_valid(ioapic); } diff --git a/arch/x86/kvm/pmu_amd.c b/arch/x86/kvm/pmu_amd.c index 39b9112..cd94443 100644 --- a/arch/x86/kvm/pmu_amd.c +++ b/arch/x86/kvm/pmu_amd.c @@ -23,8 +23,8 @@ static struct kvm_event_hw_type_mapping amd_event_mapping[] = { [0] = { 0x76, 0x00, PERF_COUNT_HW_CPU_CYCLES }, [1] = { 0xc0, 0x00, PERF_COUNT_HW_INSTRUCTIONS }, - [2] = { 0x80, 0x00, PERF_COUNT_HW_CACHE_REFERENCES }, - [3] = { 0x81, 0x00, PERF_COUNT_HW_CACHE_MISSES }, + [2] = { 0x7d, 0x07, PERF_COUNT_HW_CACHE_REFERENCES }, + [3] = { 0x7e, 0x07, PERF_COUNT_HW_CACHE_MISSES }, [4] = { 0xc2, 0x00, PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, [5] = { 0xc3, 0x00, PERF_COUNT_HW_BRANCH_MISSES }, [6] = { 0xd0, 0x00, PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 19f9f9e..699f872 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -2743,16 +2743,16 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) if (tsc_delta < 0) mark_tsc_unstable("KVM discovered backwards TSC"); - if (kvm_lapic_hv_timer_in_use(vcpu) && - kvm_x86_ops->set_hv_timer(vcpu, - kvm_get_lapic_tscdeadline_msr(vcpu))) - kvm_lapic_switch_to_sw_timer(vcpu); if (check_tsc_unstable()) { u64 offset = kvm_compute_tsc_offset(vcpu, vcpu->arch.last_guest_tsc); kvm_x86_ops->write_tsc_offset(vcpu, offset); vcpu->arch.tsc_catchup = 1; } + if (kvm_lapic_hv_timer_in_use(vcpu) && + kvm_x86_ops->set_hv_timer(vcpu, + kvm_get_lapic_tscdeadline_msr(vcpu))) + kvm_lapic_switch_to_sw_timer(vcpu); /* * On a host with synchronized TSC, there is no need to update * kvmclock on vcpu->cpu migration diff --git 
a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 2ec0b0abb..49e6eba 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -181,11 +181,11 @@ ENDPROC(memcpy_orig) #ifndef CONFIG_UML /* - * memcpy_mcsafe - memory copy with machine check exception handling + * memcpy_mcsafe_unrolled - memory copy with machine check exception handling * Note that we only catch machine checks when reading the source addresses. * Writes to target are posted and don't generate machine checks. */ -ENTRY(memcpy_mcsafe) +ENTRY(memcpy_mcsafe_unrolled) cmpl $8, %edx /* Less than 8 bytes? Go to byte copy loop */ jb .L_no_whole_words @@ -273,7 +273,7 @@ ENTRY(memcpy_mcsafe) .L_done_memcpy_trap: xorq %rax, %rax ret -ENDPROC(memcpy_mcsafe) +ENDPROC(memcpy_mcsafe_unrolled) .section .fixup, "ax" /* Return -EFAULT for any failure */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 849dc09..e3353c9 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -917,11 +917,11 @@ static void populate_pte(struct cpa_data *cpa, } } -static int populate_pmd(struct cpa_data *cpa, - unsigned long start, unsigned long end, - unsigned num_pages, pud_t *pud, pgprot_t pgprot) +static long populate_pmd(struct cpa_data *cpa, + unsigned long start, unsigned long end, + unsigned num_pages, pud_t *pud, pgprot_t pgprot) { - unsigned int cur_pages = 0; + long cur_pages = 0; pmd_t *pmd; pgprot_t pmd_pgprot; @@ -991,12 +991,12 @@ static int populate_pmd(struct cpa_data *cpa, return num_pages; } -static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, - pgprot_t pgprot) +static long populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, + pgprot_t pgprot) { pud_t *pud; unsigned long end; - int cur_pages = 0; + long cur_pages = 0; pgprot_t pud_pgprot; end = start + (cpa->numpages << PAGE_SHIFT); @@ -1052,7 +1052,7 @@ static int populate_pud(struct cpa_data *cpa, unsigned long start, pgd_t *pgd, /* Map trailing leftover */ if (start < end) { - int tmp; + long tmp; pud = pud_offset(pgd, start); if (pud_none(*pud)) @@ -1078,7 +1078,7 @@ static int populate_pgd(struct cpa_data *cpa, unsigned long addr) pgprot_t pgprot = __pgprot(_KERNPG_TABLE); pud_t *pud = NULL; /* shut up gcc */ pgd_t *pgd_entry; - int ret; + long ret; pgd_entry = cpa->pgd + pgd_index(addr); @@ -1327,7 +1327,8 @@ static int cpa_process_alias(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { - int ret, numpages = cpa->numpages; + unsigned long numpages = cpa->numpages; + int ret; while (numpages) { /* diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ecb1b69..170cc4f 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma) } /* - * prot is passed in as a parameter for the new mapping. If the vma has a - * linear pfn mapping for the entire range reserve the entire vma range with - * single reserve_pfn_range call. + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. 
*/ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) @@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ - if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); - if (!ret) + if (ret == 0 && vma) vma->vm_flags |= VM_PAT; return ret; } @@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!(vma->vm_flags & VM_PAT)) + if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - vma->vm_flags &= ~VM_PAT; + if (vma) + vma->vm_flags &= ~VM_PAT; } /* diff --git a/arch/x86/pci/fixup.c b/arch/x86/pci/fixup.c index 837ea36..6d52b94 100644 --- a/arch/x86/pci/fixup.c +++ b/arch/x86/pci/fixup.c @@ -553,15 +553,21 @@ static void twinhead_reserve_killing_zone(struct pci_dev *dev) DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_INTEL, 0x27B9, twinhead_reserve_killing_zone); /* - * Broadwell EP Home Agent BARs erroneously return non-zero values when read. + * Device [8086:2fc0] + * Erratum HSE43 + * CONFIG_TDP_NOMINAL CSR Implemented at Incorrect Offset + * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v3-spec-update.html * - * See http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html - * entry BDF2. + * Devices [8086:6f60,6fa0,6fc0] + * Erratum BDF2 + * PCI BARs in the Home Agent Will Return Non-Zero Values During Enumeration + * http://www.intel.com/content/www/us/en/processors/xeon/xeon-e5-v4-spec-update.html */ -static void pci_bdwep_bar(struct pci_dev *dev) +static void pci_invalid_bar(struct pci_dev *dev) { dev->non_compliant_bars = 1; } -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_bdwep_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_bdwep_bar); -DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_bdwep_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x2fc0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6f60, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fa0, pci_invalid_bar); +DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_INTEL, 0x6fc0, pci_invalid_bar); diff --git a/arch/x86/platform/efi/efi-bgrt.c b/arch/x86/platform/efi/efi-bgrt.c index 6a2f569..6aad870 100644 --- a/arch/x86/platform/efi/efi-bgrt.c +++ b/arch/x86/platform/efi/efi-bgrt.c @@ -82,21 +82,12 @@ void __init efi_bgrt_init(void) } bgrt_image_size = bmp_header.size; - bgrt_image = kmalloc(bgrt_image_size, GFP_KERNEL | __GFP_NOWARN); + bgrt_image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); if (!bgrt_image) { - pr_notice("Ignoring BGRT: failed to allocate memory for image (wanted %zu bytes)\n", - bgrt_image_size); - return; - } - - image = memremap(bgrt_tab->image_address, bmp_header.size, MEMREMAP_WB); - if (!image) { pr_notice("Ignoring BGRT: failed to map image memory\n"); - kfree(bgrt_image); bgrt_image = NULL; return; } - memcpy(bgrt_image, image, bgrt_image_size); - memunmap(image); + efi_mem_reserve(bgrt_tab->image_address, bgrt_image_size); } diff --git a/arch/x86/platform/efi/efi.c 
b/arch/x86/platform/efi/efi.c index 1fbb408..0955c70 100644 --- a/arch/x86/platform/efi/efi.c +++ b/arch/x86/platform/efi/efi.c @@ -172,7 +172,9 @@ static void __init do_add_efi_memmap(void) int __init efi_memblock_x86_reserve_range(void) { struct efi_info *e = &boot_params.efi_info; + struct efi_memory_map_data data; phys_addr_t pmap; + int rv; if (efi_enabled(EFI_PARAVIRT)) return 0; @@ -187,11 +189,17 @@ int __init efi_memblock_x86_reserve_range(void) #else pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32)); #endif - efi.memmap.phys_map = pmap; - efi.memmap.nr_map = e->efi_memmap_size / - e->efi_memdesc_size; - efi.memmap.desc_size = e->efi_memdesc_size; - efi.memmap.desc_version = e->efi_memdesc_version; + data.phys_map = pmap; + data.size = e->efi_memmap_size; + data.desc_size = e->efi_memdesc_size; + data.desc_version = e->efi_memdesc_version; + + rv = efi_memmap_init_early(&data); + if (rv) + return rv; + + if (add_efi_memmap) + do_add_efi_memmap(); WARN(efi.memmap.desc_version != 1, "Unexpected EFI_MEMORY_DESCRIPTOR version %ld", @@ -218,19 +226,6 @@ void __init efi_print_memmap(void) } } -void __init efi_unmap_memmap(void) -{ - unsigned long size; - - clear_bit(EFI_MEMMAP, &efi.flags); - - size = efi.memmap.nr_map * efi.memmap.desc_size; - if (efi.memmap.map) { - early_memunmap(efi.memmap.map, size); - efi.memmap.map = NULL; - } -} - static int __init efi_systab_init(void *phys) { if (efi_enabled(EFI_64BIT)) { @@ -414,33 +409,6 @@ static int __init efi_runtime_init(void) return 0; } -static int __init efi_memmap_init(void) -{ - unsigned long addr, size; - - if (efi_enabled(EFI_PARAVIRT)) - return 0; - - /* Map the EFI memory map */ - size = efi.memmap.nr_map * efi.memmap.desc_size; - addr = (unsigned long)efi.memmap.phys_map; - - efi.memmap.map = early_memremap(addr, size); - if (efi.memmap.map == NULL) { - pr_err("Could not map the memory map!\n"); - return -ENOMEM; - } - - efi.memmap.map_end = efi.memmap.map + size; - - if (add_efi_memmap) - do_add_efi_memmap(); - - set_bit(EFI_MEMMAP, &efi.flags); - - return 0; -} - void __init efi_init(void) { efi_char16_t *c16; @@ -498,16 +466,14 @@ void __init efi_init(void) if (!efi_runtime_supported()) pr_info("No EFI runtime due to 32/64-bit mismatch with kernel\n"); else { - if (efi_runtime_disabled() || efi_runtime_init()) + if (efi_runtime_disabled() || efi_runtime_init()) { + efi_memmap_unmap(); return; + } } - if (efi_memmap_init()) - return; if (efi_enabled(EFI_DBG)) efi_print_memmap(); - - efi_esrt_init(); } void __init efi_late_init(void) @@ -624,42 +590,6 @@ static void __init get_systab_virt_addr(efi_memory_desc_t *md) } } -static void __init save_runtime_map(void) -{ -#ifdef CONFIG_KEXEC_CORE - unsigned long desc_size; - efi_memory_desc_t *md; - void *tmp, *q = NULL; - int count = 0; - - if (efi_enabled(EFI_OLD_MEMMAP)) - return; - - desc_size = efi.memmap.desc_size; - - for_each_efi_memory_desc(md) { - if (!(md->attribute & EFI_MEMORY_RUNTIME) || - (md->type == EFI_BOOT_SERVICES_CODE) || - (md->type == EFI_BOOT_SERVICES_DATA)) - continue; - tmp = krealloc(q, (count + 1) * desc_size, GFP_KERNEL); - if (!tmp) - goto out; - q = tmp; - - memcpy(q + count * desc_size, md, desc_size); - count++; - } - - efi_runtime_map_setup(q, count, desc_size); - return; - -out: - kfree(q); - pr_err("Error saving runtime map, efi runtime on kexec non-functional!!\n"); -#endif -} - static void *realloc_pages(void *old_memmap, int old_shift) { void *ret; @@ -745,6 +675,46 @@ static void *efi_map_next_entry(void *entry) return entry; } +static 
bool should_map_region(efi_memory_desc_t *md) +{ + /* + * Runtime regions always require runtime mappings (obviously). + */ + if (md->attribute & EFI_MEMORY_RUNTIME) + return true; + + /* + * 32-bit EFI doesn't suffer from the bug that requires us to + * reserve boot services regions, and mixed mode support + * doesn't exist for 32-bit kernels. + */ + if (IS_ENABLED(CONFIG_X86_32)) + return false; + + /* + * Map all of RAM so that we can access arguments in the 1:1 + * mapping when making EFI runtime calls. + */ + if (IS_ENABLED(CONFIG_EFI_MIXED) && !efi_is_native()) { + if (md->type == EFI_CONVENTIONAL_MEMORY || + md->type == EFI_LOADER_DATA || + md->type == EFI_LOADER_CODE) + return true; + } + + /* + * Map boot services regions as a workaround for buggy + * firmware that accesses them even when they shouldn't. + * + * See efi_{reserve,free}_boot_services(). + */ + if (md->type == EFI_BOOT_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_DATA) + return true; + + return false; +} + /* * Map the efi memory ranges of the runtime services and update new_mmap with * virtual addresses. @@ -761,13 +731,9 @@ static void * __init efi_map_regions(int *count, int *pg_shift) p = NULL; while ((p = efi_map_next_entry(p))) { md = p; - if (!(md->attribute & EFI_MEMORY_RUNTIME)) { -#ifdef CONFIG_X86_64 - if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) -#endif - continue; - } + + if (!should_map_region(md)) + continue; efi_map_region(md); get_systab_virt_addr(md); @@ -803,7 +769,7 @@ static void __init kexec_enter_virtual_mode(void) * non-native EFI */ if (!efi_is_native()) { - efi_unmap_memmap(); + efi_memmap_unmap(); clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -823,7 +789,18 @@ static void __init kexec_enter_virtual_mode(void) get_systab_virt_addr(md); } - save_runtime_map(); + /* + * Unregister the early EFI memmap from efi_init() and install + * the new EFI memory map. + */ + efi_memmap_unmap(); + + if (efi_memmap_init_late(efi.memmap.phys_map, + efi.memmap.desc_size * efi.memmap.nr_map)) { + pr_err("Failed to remap late EFI memory map\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } BUG_ON(!efi.systab); @@ -884,6 +861,7 @@ static void __init __efi_enter_virtual_mode(void) int count = 0, pg_shift = 0; void *new_memmap = NULL; efi_status_t status; + phys_addr_t pa; efi.systab = NULL; @@ -901,11 +879,24 @@ static void __init __efi_enter_virtual_mode(void) return; } - save_runtime_map(); + pa = __pa(new_memmap); + + /* + * Unregister the early EFI memmap from efi_init() and install + * the new EFI memory map that we are about to pass to the + * firmware via SetVirtualAddressMap(). 
+ */ + efi_memmap_unmap(); + + if (efi_memmap_init_late(pa, efi.memmap.desc_size * count)) { + pr_err("Failed to remap late EFI memory map\n"); + clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); + return; + } BUG_ON(!efi.systab); - if (efi_setup_page_tables(__pa(new_memmap), 1 << pg_shift)) { + if (efi_setup_page_tables(pa, 1 << pg_shift)) { clear_bit(EFI_RUNTIME_SERVICES, &efi.flags); return; } @@ -917,14 +908,14 @@ static void __init __efi_enter_virtual_mode(void) efi.memmap.desc_size * count, efi.memmap.desc_size, efi.memmap.desc_version, - (efi_memory_desc_t *)__pa(new_memmap)); + (efi_memory_desc_t *)pa); } else { status = efi_thunk_set_virtual_address_map( efi_phys.set_virtual_address_map, efi.memmap.desc_size * count, efi.memmap.desc_size, efi.memmap.desc_version, - (efi_memory_desc_t *)__pa(new_memmap)); + (efi_memory_desc_t *)pa); } if (status != EFI_SUCCESS) { @@ -956,15 +947,6 @@ static void __init __efi_enter_virtual_mode(void) efi_runtime_update_mappings(); efi_dump_pagetable(); - /* - * We mapped the descriptor array into the EFI pagetable above - * but we're not unmapping it here because if we're running in - * EFI mixed mode we need all of memory to be accessible when - * we pass parameters to the EFI runtime services in the - * thunking code. - */ - free_pages((unsigned long)new_memmap, pg_shift); - /* clean DUMMY object */ efi_delete_dummy_variable(); } diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c index 677e29e..58b0f80 100644 --- a/arch/x86/platform/efi/efi_64.c +++ b/arch/x86/platform/efi/efi_64.c @@ -85,7 +85,7 @@ pgd_t * __init efi_call_phys_prolog(void) early_code_mapping_set_exec(1); n_pgds = DIV_ROUND_UP((max_pfn << PAGE_SHIFT), PGDIR_SIZE); - save_pgd = kmalloc(n_pgds * sizeof(pgd_t), GFP_KERNEL); + save_pgd = kmalloc_array(n_pgds, sizeof(*save_pgd), GFP_KERNEL); for (pgd = 0; pgd < n_pgds; pgd++) { save_pgd[pgd] = *pgd_offset_k(pgd * PGDIR_SIZE); @@ -214,7 +214,6 @@ void efi_sync_low_kernel_mappings(void) int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) { unsigned long pfn, text; - efi_memory_desc_t *md; struct page *page; unsigned npages; pgd_t *pgd; @@ -245,28 +244,9 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages) * text and allocate a new stack because we can't rely on the * stack pointer being < 4GB. */ - if (!IS_ENABLED(CONFIG_EFI_MIXED)) + if (!IS_ENABLED(CONFIG_EFI_MIXED) || efi_is_native()) return 0; - /* - * Map all of RAM so that we can access arguments in the 1:1 - * mapping when making EFI runtime calls. 
- */ - for_each_efi_memory_desc(md) { - if (md->type != EFI_CONVENTIONAL_MEMORY && - md->type != EFI_LOADER_DATA && - md->type != EFI_LOADER_CODE) - continue; - - pfn = md->phys_addr >> PAGE_SHIFT; - npages = md->num_pages; - - if (kernel_map_pages_in_pgd(pgd, pfn, md->phys_addr, npages, _PAGE_RW)) { - pr_err("Failed to map 1:1 memory\n"); - return 1; - } - } - page = alloc_page(GFP_KERNEL|__GFP_DMA32); if (!page) panic("Unable to allocate EFI runtime stack < 4GB\n"); @@ -359,6 +339,7 @@ void __init efi_map_region(efi_memory_desc_t *md) */ void __init efi_map_region_fixed(efi_memory_desc_t *md) { + __map_region(md, md->phys_addr); __map_region(md, md->virt_addr); } diff --git a/arch/x86/platform/efi/quirks.c b/arch/x86/platform/efi/quirks.c index 89d1146..10aca63 100644 --- a/arch/x86/platform/efi/quirks.c +++ b/arch/x86/platform/efi/quirks.c @@ -164,6 +164,75 @@ efi_status_t efi_query_variable_store(u32 attributes, unsigned long size, EXPORT_SYMBOL_GPL(efi_query_variable_store); /* + * The UEFI specification makes it clear that the operating system is + * free to do whatever it wants with boot services code after + * ExitBootServices() has been called. Ignoring this recommendation a + * significant bunch of EFI implementations continue calling into boot + * services code (SetVirtualAddressMap). In order to work around such + * buggy implementations we reserve boot services region during EFI + * init and make sure it stays executable. Then, after + * SetVirtualAddressMap(), it is discarded. + * + * However, some boot services regions contain data that is required + * by drivers, so we need to track which memory ranges can never be + * freed. This is done by tagging those regions with the + * EFI_MEMORY_RUNTIME attribute. + * + * Any driver that wants to mark a region as reserved must use + * efi_mem_reserve() which will insert a new EFI memory descriptor + * into efi.memmap (splitting existing regions if necessary) and tag + * it with EFI_MEMORY_RUNTIME. + */ +void __init efi_arch_mem_reserve(phys_addr_t addr, u64 size) +{ + phys_addr_t new_phys, new_size; + struct efi_mem_range mr; + efi_memory_desc_t md; + int num_entries; + void *new; + + if (efi_mem_desc_lookup(addr, &md)) { + pr_err("Failed to lookup EFI memory descriptor for %pa\n", &addr); + return; + } + + if (addr + size > md.phys_addr + (md.num_pages << EFI_PAGE_SHIFT)) { + pr_err("Region spans EFI memory descriptors, %pa\n", &addr); + return; + } + + size += addr % EFI_PAGE_SIZE; + size = round_up(size, EFI_PAGE_SIZE); + addr = round_down(addr, EFI_PAGE_SIZE); + + mr.range.start = addr; + mr.range.end = addr + size - 1; + mr.attribute = md.attribute | EFI_MEMORY_RUNTIME; + + num_entries = efi_memmap_split_count(&md, &mr.range); + num_entries += efi.memmap.nr_map; + + new_size = efi.memmap.desc_size * num_entries; + + new_phys = memblock_alloc(new_size, 0); + if (!new_phys) { + pr_err("Could not allocate boot services memmap\n"); + return; + } + + new = early_memremap(new_phys, new_size); + if (!new) { + pr_err("Failed to map new boot services memmap\n"); + return; + } + + efi_memmap_insert(&efi.memmap, new, &mr); + early_memunmap(new, new_size); + + efi_memmap_install(new_phys, num_entries); +} + +/* * Helper function for efi_reserve_boot_services() to figure out if we * can free regions in efi_free_boot_services(). 
* @@ -184,15 +253,6 @@ static bool can_free_region(u64 start, u64 size) return true; } -/* - * The UEFI specification makes it clear that the operating system is free to do - * whatever it wants with boot services code after ExitBootServices() has been - * called. Ignoring this recommendation a significant bunch of EFI implementations - * continue calling into boot services code (SetVirtualAddressMap). In order to - * work around such buggy implementations we reserve boot services region during - * EFI init and make sure it stays executable. Then, after SetVirtualAddressMap(), it -* is discarded. -*/ void __init efi_reserve_boot_services(void) { efi_memory_desc_t *md; @@ -249,7 +309,10 @@ void __init efi_reserve_boot_services(void) void __init efi_free_boot_services(void) { + phys_addr_t new_phys, new_size; efi_memory_desc_t *md; + int num_entries = 0; + void *new, *new_md; for_each_efi_memory_desc(md) { unsigned long long start = md->phys_addr; @@ -257,12 +320,16 @@ void __init efi_free_boot_services(void) size_t rm_size; if (md->type != EFI_BOOT_SERVICES_CODE && - md->type != EFI_BOOT_SERVICES_DATA) + md->type != EFI_BOOT_SERVICES_DATA) { + num_entries++; continue; + } /* Do not free, someone else owns it: */ - if (md->attribute & EFI_MEMORY_RUNTIME) + if (md->attribute & EFI_MEMORY_RUNTIME) { + num_entries++; continue; + } /* * Nasty quirk: if all sub-1MB memory is used for boot @@ -287,7 +354,41 @@ void __init efi_free_boot_services(void) free_bootmem_late(start, size); } - efi_unmap_memmap(); + new_size = efi.memmap.desc_size * num_entries; + new_phys = memblock_alloc(new_size, 0); + if (!new_phys) { + pr_err("Failed to allocate new EFI memmap\n"); + return; + } + + new = memremap(new_phys, new_size, MEMREMAP_WB); + if (!new) { + pr_err("Failed to map new EFI memmap\n"); + return; + } + + /* + * Build a new EFI memmap that excludes any boot services + * regions that are not tagged EFI_MEMORY_RUNTIME, since those + * regions have now been freed. + */ + new_md = new; + for_each_efi_memory_desc(md) { + if (!(md->attribute & EFI_MEMORY_RUNTIME) && + (md->type == EFI_BOOT_SERVICES_CODE || + md->type == EFI_BOOT_SERVICES_DATA)) + continue; + + memcpy(new_md, md, efi.memmap.desc_size); + new_md += efi.memmap.desc_size; + } + + memunmap(new); + + if (efi_memmap_install(new_phys, num_entries)) { + pr_err("Could not install new EFI memmap\n"); + return; + } } /* @@ -365,7 +466,7 @@ void __init efi_apply_memmap_quirks(void) */ if (!efi_runtime_supported()) { pr_info("Setup done, disabling due to 32/64-bit mismatch\n"); - efi_unmap_memmap(); + efi_memmap_unmap(); } /* UV2+ BIOS has a fix for this issue. UV1 still needs the quirk. 
*/ diff --git a/arch/x86/ras/mce_amd_inj.c b/arch/x86/ras/mce_amd_inj.c index 1104515..1ac7647 100644 --- a/arch/x86/ras/mce_amd_inj.c +++ b/arch/x86/ras/mce_amd_inj.c @@ -68,6 +68,7 @@ static int inj_##reg##_set(void *data, u64 val) \ MCE_INJECT_SET(status); MCE_INJECT_SET(misc); MCE_INJECT_SET(addr); +MCE_INJECT_SET(synd); #define MCE_INJECT_GET(reg) \ static int inj_##reg##_get(void *data, u64 *val) \ @@ -81,10 +82,12 @@ static int inj_##reg##_get(void *data, u64 *val) \ MCE_INJECT_GET(status); MCE_INJECT_GET(misc); MCE_INJECT_GET(addr); +MCE_INJECT_GET(synd); DEFINE_SIMPLE_ATTRIBUTE(status_fops, inj_status_get, inj_status_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(misc_fops, inj_misc_get, inj_misc_set, "%llx\n"); DEFINE_SIMPLE_ATTRIBUTE(addr_fops, inj_addr_get, inj_addr_set, "%llx\n"); +DEFINE_SIMPLE_ATTRIBUTE(synd_fops, inj_synd_get, inj_synd_set, "%llx\n"); /* * Caller needs to be make sure this cpu doesn't disappear @@ -243,27 +246,27 @@ static void toggle_nb_mca_mst_cpu(u16 nid) static void prepare_msrs(void *info) { - struct mce i_mce = *(struct mce *)info; - u8 b = i_mce.bank; + struct mce m = *(struct mce *)info; + u8 b = m.bank; - wrmsrl(MSR_IA32_MCG_STATUS, i_mce.mcgstatus); + wrmsrl(MSR_IA32_MCG_STATUS, m.mcgstatus); if (boot_cpu_has(X86_FEATURE_SMCA)) { - if (i_mce.inject_flags == DFR_INT_INJ) { - wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), i_mce.status); - wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), i_mce.addr); + if (m.inject_flags == DFR_INT_INJ) { + wrmsrl(MSR_AMD64_SMCA_MCx_DESTAT(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_DEADDR(b), m.addr); } else { - wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), i_mce.status); - wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), i_mce.addr); + wrmsrl(MSR_AMD64_SMCA_MCx_STATUS(b), m.status); + wrmsrl(MSR_AMD64_SMCA_MCx_ADDR(b), m.addr); } - wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), i_mce.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_MISC(b), m.misc); + wrmsrl(MSR_AMD64_SMCA_MCx_SYND(b), m.synd); } else { - wrmsrl(MSR_IA32_MCx_STATUS(b), i_mce.status); - wrmsrl(MSR_IA32_MCx_ADDR(b), i_mce.addr); - wrmsrl(MSR_IA32_MCx_MISC(b), i_mce.misc); + wrmsrl(MSR_IA32_MCx_STATUS(b), m.status); + wrmsrl(MSR_IA32_MCx_ADDR(b), m.addr); + wrmsrl(MSR_IA32_MCx_MISC(b), m.misc); } - } static void do_inject(void) @@ -275,6 +278,9 @@ static void do_inject(void) if (i_mce.misc) i_mce.status |= MCI_STATUS_MISCV; + if (i_mce.synd) + i_mce.status |= MCI_STATUS_SYNDV; + if (inj_type == SW_INJ) { mce_inject_log(&i_mce); return; @@ -301,7 +307,9 @@ static void do_inject(void) * only on the node base core. Refer to D18F3x44[NbMcaToMstCpuEn] for * Fam10h and later BKDGs. */ - if (static_cpu_has(X86_FEATURE_AMD_DCM) && b == 4) { + if (static_cpu_has(X86_FEATURE_AMD_DCM) && + b == 4 && + boot_cpu_data.x86 < 0x17) { toggle_nb_mca_mst_cpu(amd_get_nb_id(cpu)); cpu = get_nbc_for_node(amd_get_nb_id(cpu)); } @@ -371,6 +379,9 @@ static const char readme_msg[] = "\t used for error thresholding purposes and its validity is indicated by\n" "\t MCi_STATUS[MiscV].\n" "\n" +"synd:\t Set MCi_SYND: provide syndrome info about the error. Only valid on\n" +"\t Scalable MCA systems, and its validity is indicated by MCi_STATUS[SyndV].\n" +"\n" "addr:\t Error address value to be written to MCi_ADDR. 
Log address information\n" "\t associated with the error.\n" "\n" @@ -420,6 +431,7 @@ static struct dfs_node { { .name = "status", .fops = &status_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "misc", .fops = &misc_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "addr", .fops = &addr_fops, .perm = S_IRUSR | S_IWUSR }, + { .name = "synd", .fops = &synd_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "bank", .fops = &bank_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "flags", .fops = &flags_fops, .perm = S_IRUSR | S_IWUSR }, { .name = "cpu", .fops = &extcpu_fops, .perm = S_IRUSR | S_IWUSR }, @@ -428,7 +440,7 @@ static struct dfs_node { static int __init init_mce_inject(void) { - int i; + unsigned int i; u64 cap; rdmsrl(MSR_IA32_MCG_CAP, cap); @@ -452,26 +464,22 @@ static int __init init_mce_inject(void) return 0; err_dfs_add: - while (--i >= 0) + while (i-- > 0) debugfs_remove(dfs_fls[i].d); debugfs_remove(dfs_inj); dfs_inj = NULL; - return -ENOMEM; + return -ENODEV; } static void __exit exit_mce_inject(void) { - int i; - for (i = 0; i < ARRAY_SIZE(dfs_fls); i++) - debugfs_remove(dfs_fls[i].d); + debugfs_remove_recursive(dfs_inj); + dfs_inj = NULL; memset(&dfs_fls, 0, sizeof(dfs_fls)); - - debugfs_remove(dfs_inj); - dfs_inj = NULL; } module_init(init_mce_inject); module_exit(exit_mce_inject); diff --git a/arch/x86/um/ptrace_32.c b/arch/x86/um/ptrace_32.c index ebd4dd6..a7ef7b1 100644 --- a/arch/x86/um/ptrace_32.c +++ b/arch/x86/um/ptrace_32.c @@ -84,7 +84,10 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case EAX: case EIP: case UESP: + break; case ORIG_EAX: + /* Update the syscall number. */ + UPT_SYSCALL_NR(&child->thread.regs.regs) = value; break; case FS: if (value && (value & 3) != 3) diff --git a/arch/x86/um/ptrace_64.c b/arch/x86/um/ptrace_64.c index faab418..0b5c184 100644 --- a/arch/x86/um/ptrace_64.c +++ b/arch/x86/um/ptrace_64.c @@ -78,7 +78,11 @@ int putreg(struct task_struct *child, int regno, unsigned long value) case RSI: case RDI: case RBP: + break; + case ORIG_RAX: + /* Update the syscall number. */ + UPT_SYSCALL_NR(&child->thread.regs.regs) = value; break; case FS: diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index b86ebb1..bc9aaba 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1925,6 +1925,45 @@ static void xen_set_cpu_features(struct cpuinfo_x86 *c) } } +static void xen_pin_vcpu(int cpu) +{ + static bool disable_pinning; + struct sched_pin_override pin_override; + int ret; + + if (disable_pinning) + return; + + pin_override.pcpu = cpu; + ret = HYPERVISOR_sched_op(SCHEDOP_pin_override, &pin_override); + + /* Ignore errors when removing override. */ + if (cpu < 0) + return; + + switch (ret) { + case -ENOSYS: + pr_warn("Unable to pin on physical cpu %d. In case of problems consider vcpu pinning.\n", + cpu); + disable_pinning = true; + break; + case -EPERM: + WARN(1, "Trying to pin vcpu without having privilege to do so\n"); + disable_pinning = true; + break; + case -EINVAL: + case -EBUSY: + pr_warn("Physical cpu %d not available for pinning. 
Check Xen cpu configuration.\n", + cpu); + break; + case 0: + break; + default: + WARN(1, "rc %d while trying to pin vcpu\n", ret); + disable_pinning = true; + } +} + const struct hypervisor_x86 x86_hyper_xen = { .name = "Xen", .detect = xen_platform, @@ -1933,6 +1972,7 @@ const struct hypervisor_x86 x86_hyper_xen = { #endif .x2apic_available = xen_x2apic_para_available, .set_cpu_features = xen_set_cpu_features, + .pin_vcpu = xen_pin_vcpu, }; EXPORT_SYMBOL(x86_hyper_xen); diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index f42e78d..3d6e006 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -21,8 +21,6 @@ static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; static DEFINE_PER_CPU(char *, irq_name); static bool xen_pvspin = true; -#ifdef CONFIG_QUEUED_SPINLOCKS - #include <asm/qspinlock.h> static void xen_qlock_kick(int cpu) @@ -71,207 +69,6 @@ static void xen_qlock_wait(u8 *byte, u8 val) xen_poll_irq(irq); } -#else /* CONFIG_QUEUED_SPINLOCKS */ - -enum xen_contention_stat { - TAKEN_SLOW, - TAKEN_SLOW_PICKUP, - TAKEN_SLOW_SPURIOUS, - RELEASED_SLOW, - RELEASED_SLOW_KICKED, - NR_CONTENTION_STATS -}; - - -#ifdef CONFIG_XEN_DEBUG_FS -#define HISTO_BUCKETS 30 -static struct xen_spinlock_stats -{ - u32 contention_stats[NR_CONTENTION_STATS]; - u32 histo_spin_blocked[HISTO_BUCKETS+1]; - u64 time_blocked; -} spinlock_stats; - -static u8 zero_stats; - -static inline void check_zero(void) -{ - u8 ret; - u8 old = READ_ONCE(zero_stats); - if (unlikely(old)) { - ret = cmpxchg(&zero_stats, old, 0); - /* This ensures only one fellow resets the stat */ - if (ret == old) - memset(&spinlock_stats, 0, sizeof(spinlock_stats)); - } -} - -static inline void add_stats(enum xen_contention_stat var, u32 val) -{ - check_zero(); - spinlock_stats.contention_stats[var] += val; -} - -static inline u64 spin_time_start(void) -{ - return xen_clocksource_read(); -} - -static void __spin_time_accum(u64 delta, u32 *array) -{ - unsigned index = ilog2(delta); - - check_zero(); - - if (index < HISTO_BUCKETS) - array[index]++; - else - array[HISTO_BUCKETS]++; -} - -static inline void spin_time_accum_blocked(u64 start) -{ - u32 delta = xen_clocksource_read() - start; - - __spin_time_accum(delta, spinlock_stats.histo_spin_blocked); - spinlock_stats.time_blocked += delta; -} -#else /* !CONFIG_XEN_DEBUG_FS */ -static inline void add_stats(enum xen_contention_stat var, u32 val) -{ -} - -static inline u64 spin_time_start(void) -{ - return 0; -} - -static inline void spin_time_accum_blocked(u64 start) -{ -} -#endif /* CONFIG_XEN_DEBUG_FS */ - -struct xen_lock_waiting { - struct arch_spinlock *lock; - __ticket_t want; -}; - -static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); -static cpumask_t waiting_cpus; - -__visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) -{ - int irq = __this_cpu_read(lock_kicker_irq); - struct xen_lock_waiting *w = this_cpu_ptr(&lock_waiting); - int cpu = smp_processor_id(); - u64 start; - __ticket_t head; - unsigned long flags; - - /* If kicker interrupts not initialized yet, just spin */ - if (irq == -1) - return; - - start = spin_time_start(); - - /* - * Make sure an interrupt handler can't upset things in a - * partially setup state. - */ - local_irq_save(flags); - /* - * We don't really care if we're overwriting some other - * (lock,want) pair, as that would mean that we're currently - * in an interrupt context, and the outer context had - * interrupts enabled. 
That has already kicked the VCPU out - * of xen_poll_irq(), so it will just return spuriously and - * retry with newly setup (lock,want). - * - * The ordering protocol on this is that the "lock" pointer - * may only be set non-NULL if the "want" ticket is correct. - * If we're updating "want", we must first clear "lock". - */ - w->lock = NULL; - smp_wmb(); - w->want = want; - smp_wmb(); - w->lock = lock; - - /* This uses set_bit, which atomic and therefore a barrier */ - cpumask_set_cpu(cpu, &waiting_cpus); - add_stats(TAKEN_SLOW, 1); - - /* clear pending */ - xen_clear_irq_pending(irq); - - /* Only check lock once pending cleared */ - barrier(); - - /* - * Mark entry to slowpath before doing the pickup test to make - * sure we don't deadlock with an unlocker. - */ - __ticket_enter_slowpath(lock); - - /* make sure enter_slowpath, which is atomic does not cross the read */ - smp_mb__after_atomic(); - - /* - * check again make sure it didn't become free while - * we weren't looking - */ - head = READ_ONCE(lock->tickets.head); - if (__tickets_equal(head, want)) { - add_stats(TAKEN_SLOW_PICKUP, 1); - goto out; - } - - /* Allow interrupts while blocked */ - local_irq_restore(flags); - - /* - * If an interrupt happens here, it will leave the wakeup irq - * pending, which will cause xen_poll_irq() to return - * immediately. - */ - - /* Block until irq becomes pending (or perhaps a spurious wakeup) */ - xen_poll_irq(irq); - add_stats(TAKEN_SLOW_SPURIOUS, !xen_test_irq_pending(irq)); - - local_irq_save(flags); - - kstat_incr_irq_this_cpu(irq); -out: - cpumask_clear_cpu(cpu, &waiting_cpus); - w->lock = NULL; - - local_irq_restore(flags); - - spin_time_accum_blocked(start); -} -PV_CALLEE_SAVE_REGS_THUNK(xen_lock_spinning); - -static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) -{ - int cpu; - - add_stats(RELEASED_SLOW, 1); - - for_each_cpu(cpu, &waiting_cpus) { - const struct xen_lock_waiting *w = &per_cpu(lock_waiting, cpu); - - /* Make sure we read lock before want */ - if (READ_ONCE(w->lock) == lock && - READ_ONCE(w->want) == next) { - add_stats(RELEASED_SLOW_KICKED, 1); - xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); - break; - } - } -} -#endif /* CONFIG_QUEUED_SPINLOCKS */ - static irqreturn_t dummy_handler(int irq, void *dev_id) { BUG(); @@ -334,16 +131,12 @@ void __init xen_init_spinlocks(void) return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); -#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); pv_lock_ops.wait = xen_qlock_wait; pv_lock_ops.kick = xen_qlock_kick; -#else - pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); - pv_lock_ops.unlock_kick = xen_unlock_kick; -#endif } /* @@ -372,44 +165,3 @@ static __init int xen_parse_nopvspin(char *arg) } early_param("xen_nopvspin", xen_parse_nopvspin); -#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS) - -static struct dentry *d_spin_debug; - -static int __init xen_spinlock_debugfs(void) -{ - struct dentry *d_xen = xen_init_debugfs(); - - if (d_xen == NULL) - return -ENOMEM; - - if (!xen_pvspin) - return 0; - - d_spin_debug = debugfs_create_dir("spinlocks", d_xen); - - debugfs_create_u8("zero_stats", 0644, d_spin_debug, &zero_stats); - - debugfs_create_u32("taken_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW]); - debugfs_create_u32("taken_slow_pickup", 0444, d_spin_debug, - 
&spinlock_stats.contention_stats[TAKEN_SLOW_PICKUP]); - debugfs_create_u32("taken_slow_spurious", 0444, d_spin_debug, - &spinlock_stats.contention_stats[TAKEN_SLOW_SPURIOUS]); - - debugfs_create_u32("released_slow", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW]); - debugfs_create_u32("released_slow_kicked", 0444, d_spin_debug, - &spinlock_stats.contention_stats[RELEASED_SLOW_KICKED]); - - debugfs_create_u64("time_blocked", 0444, d_spin_debug, - &spinlock_stats.time_blocked); - - debugfs_create_u32_array("histo_blocked", 0444, d_spin_debug, - spinlock_stats.histo_spin_blocked, HISTO_BUCKETS + 1); - - return 0; -} -fs_initcall(xen_spinlock_debugfs); - -#endif /* CONFIG_XEN_DEBUG_FS */ |