diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-17 14:23:52 -0800 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-11-17 14:23:52 -0800 |
commit | e71d5126e77c68b7ed1fad275fdc8825e9597448 (patch) | |
tree | 038f23568881d65cb269c9c4b92e65a2d7131456 /arch/s390/kernel | |
parent | c3e9c04b89059a4c93c792da883ca284de182da5 (diff) | |
parent | ab35727eb879ccc9487c6df0a3796be124ed39d3 (diff) | |
download | op-kernel-dev-e71d5126e77c68b7ed1fad275fdc8825e9597448.zip op-kernel-dev-e71d5126e77c68b7ed1fad275fdc8825e9597448.tar.gz |
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
Pull second round of s390 updates from Martin Schwidefsky:
- rework of the vdso code to avoid the use of the access register mode
- use perf AUX buffers for the transport of diagnostic sample data
- add perf_regs and user stack dump support
- enable perf call graphs for user space programs
- add perf register support for floating-point registers
- all remaining s390 related timer_setup conversions
- bug fixes and cleanups
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (30 commits)
s390: remove unused parameter from Makefile
zfcp: purely mechanical update using timer API, plus blank lines
s390/scsi: Convert timers to use timer_setup()
s390/cpum_sf: correctly set the PID and TID in perf samples
s390/cpum_sf: load program parameter at sampler enablement
s390/perf: add perf register support for floating-point registers
s390/perf: extend perf_regs support to include floating-point registers
s390/perf: define common DWARF register string table
s390/perf: add support for perf_regs and libdw
s390/perf: add perf_regs support and user stack dump
s390/cpum_sf: do not register PMU if no sampling mode is authorized
s390/cpumf: remove raw event support in basic-only sampling mode
s390/perf: add callback to perf to enable using AUX buffer
s390/cpumf: enable using AUX buffer
s390/cpumf: introduce AUX buffer for dump diagnostic sample data
s390/disassembler: increase show_code buffer size
s390: Remove CONFIG_HARDENED_USERCOPY
s390: enable CPU alternatives unconditionally
s390/nmi: remove unused code
s390/mm: remove unused code
...
Diffstat (limited to 'arch/s390/kernel')
-rw-r--r-- | arch/s390/kernel/Makefile | 5 | ||||
-rw-r--r-- | arch/s390/kernel/asm-offsets.c | 2 | ||||
-rw-r--r-- | arch/s390/kernel/dis.c | 4 | ||||
-rw-r--r-- | arch/s390/kernel/entry.S | 33 | ||||
-rw-r--r-- | arch/s390/kernel/head64.S | 2 | ||||
-rw-r--r-- | arch/s390/kernel/module.c | 15 | ||||
-rw-r--r-- | arch/s390/kernel/nmi.c | 2 | ||||
-rw-r--r-- | arch/s390/kernel/perf_cpum_sf.c | 768 | ||||
-rw-r--r-- | arch/s390/kernel/perf_regs.c | 70 | ||||
-rw-r--r-- | arch/s390/kernel/vdso.c | 44 | ||||
-rw-r--r-- | arch/s390/kernel/vdso32/getcpu.S | 16 | ||||
-rw-r--r-- | arch/s390/kernel/vdso64/clock_gettime.S | 19 | ||||
-rw-r--r-- | arch/s390/kernel/vdso64/getcpu.S | 15 |
13 files changed, 719 insertions, 276 deletions
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile index 83bc820..909bce6 100644 --- a/arch/s390/kernel/Makefile +++ b/arch/s390/kernel/Makefile @@ -59,7 +59,7 @@ obj-y += processor.o sys_s390.o ptrace.o signal.o cpcmd.o ebcdic.o nmi.o obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o als.o obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o -obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o +obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o extra-y += head.o head64.o vmlinux.lds @@ -77,10 +77,9 @@ obj-$(CONFIG_KPROBES) += kprobes.o obj-$(CONFIG_FUNCTION_TRACER) += mcount.o ftrace.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_UPROBES) += uprobes.o -obj-$(CONFIG_ALTERNATIVES) += alternative.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o perf_cpum_cf.o perf_cpum_sf.o -obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o +obj-$(CONFIG_PERF_EVENTS) += perf_cpum_cf_events.o perf_regs.o obj-$(CONFIG_TRACEPOINTS) += trace.o diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c index 33ec80d..587b195 100644 --- a/arch/s390/kernel/asm-offsets.c +++ b/arch/s390/kernel/asm-offsets.c @@ -171,6 +171,7 @@ int main(void) OFFSET(__LC_RESTART_DATA, lowcore, restart_data); OFFSET(__LC_RESTART_SOURCE, lowcore, restart_source); OFFSET(__LC_USER_ASCE, lowcore, user_asce); + OFFSET(__LC_VDSO_ASCE, lowcore, vdso_asce); OFFSET(__LC_LPP, lowcore, lpp); OFFSET(__LC_CURRENT_PID, lowcore, current_pid); OFFSET(__LC_PERCPU_OFFSET, lowcore, percpu_offset); @@ -178,7 +179,6 @@ int main(void) OFFSET(__LC_MACHINE_FLAGS, lowcore, machine_flags); OFFSET(__LC_PREEMPT_COUNT, lowcore, preempt_count); OFFSET(__LC_GMAP, lowcore, gmap); - OFFSET(__LC_PASTE, lowcore, paste); /* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */ OFFSET(__LC_DUMP_REIPL, lowcore, ipib); /* hardware defined lowcore locations 0x1000 - 0x18ff */ diff --git a/arch/s390/kernel/dis.c b/arch/s390/kernel/dis.c index b811d3a..3be8297 100644 --- a/arch/s390/kernel/dis.c +++ b/arch/s390/kernel/dis.c @@ -480,7 +480,7 @@ void show_code(struct pt_regs *regs) { char *mode = user_mode(regs) ? "User" : "Krnl"; unsigned char code[64]; - char buffer[64], *ptr; + char buffer[128], *ptr; mm_segment_t old_fs; unsigned long addr; int start, end, opsize, hops, i; @@ -543,7 +543,7 @@ void show_code(struct pt_regs *regs) start += opsize; pr_cont("%s", buffer); ptr = buffer; - ptr += sprintf(ptr, "\n "); + ptr += sprintf(ptr, "\n\t "); hops++; } pr_cont("\n"); diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S index f498d20..a316cd6 100644 --- a/arch/s390/kernel/entry.S +++ b/arch/s390/kernel/entry.S @@ -379,13 +379,21 @@ ENTRY(system_call) jg s390_handle_mcck # TIF bit will be cleared by handler # -# _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce +# _CIF_ASCE_PRIMARY and/or _CIF_ASCE_SECONDARY set, load user space asce # .Lsysc_asce: + ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY + lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce + TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY + jz .Lsysc_return +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES + tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? + jnz .Lsysc_set_fs_fixup ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_SECONDARY - jz .Lsysc_return + j .Lsysc_return +.Lsysc_set_fs_fixup: +#endif larl %r14,.Lsysc_return jg set_fs_fixup @@ -518,6 +526,7 @@ ENTRY(pgm_check_handler) stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK lg %r12,__LC_CURRENT + lghi %r11,0 larl %r13,cleanup_critical lmg %r8,%r9,__LC_PGM_OLD_PSW tmhh %r8,0x0001 # test problem state bit @@ -532,6 +541,7 @@ ENTRY(pgm_check_handler) ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce larl %r9,sie_exit # skip forward to sie_exit + lghi %r11,_PIF_GUEST_FAULT #endif 0: tmhh %r8,0x4000 # PER bit set in old PSW ? jnz 1f # -> enabled, can't be a double fault @@ -549,13 +559,14 @@ ENTRY(pgm_check_handler) jz 3f mvc __THREAD_trap_tdb(256,%r14),0(%r13) 3: stg %r10,__THREAD_last_break(%r14) -4: la %r11,STACK_FRAME_OVERHEAD(%r15) +4: lgr %r13,%r11 + la %r11,STACK_FRAME_OVERHEAD(%r15) stmg %r0,%r7,__PT_R0(%r11) mvc __PT_R8(64,%r11),__LC_SAVE_AREA_SYNC stmg %r8,%r9,__PT_PSW(%r11) mvc __PT_INT_CODE(4,%r11),__LC_PGM_ILC mvc __PT_INT_PARM_LONG(8,%r11),__LC_TRANS_EXC_CODE - xc __PT_FLAGS(8,%r11),__PT_FLAGS(%r11) + stg %r13,__PT_FLAGS(%r11) stg %r10,__PT_ARGS(%r11) tm __LC_PGM_ILC+3,0x80 # check for per exception jz 5f @@ -738,10 +749,18 @@ ENTRY(io_int_handler) # _CIF_ASCE_PRIMARY and/or CIF_ASCE_SECONDARY set, load user space asce # .Lio_asce: + ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_SECONDARY + lctlg %c7,%c7,__LC_VDSO_ASCE # load secondary asce + TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_PRIMARY + jz .Lio_return +#ifndef CONFIG_HAVE_MARCH_Z10_FEATURES + tm __LC_STFLE_FAC_LIST+3,0x10 # has MVCOS ? + jnz .Lio_set_fs_fixup ni __LC_CPU_FLAGS+7,255-_CIF_ASCE_PRIMARY lctlg %c1,%c1,__LC_USER_ASCE # load primary asce - TSTMSK __LC_CPU_FLAGS,_CIF_ASCE_SECONDARY - jz .Lio_return + j .Lio_return +.Lio_set_fs_fixup: +#endif larl %r14,.Lio_return jg set_fs_fixup diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S index 172002d..38a973c 100644 --- a/arch/s390/kernel/head64.S +++ b/arch/s390/kernel/head64.S @@ -28,7 +28,7 @@ ENTRY(startup_continue) lctlg %c0,%c15,.Lctl-.LPG1(%r13) # load control registers lg %r12,.Lparmaddr-.LPG1(%r13) # pointer to parameter area # move IPL device to lowcore - lghi %r0,__LC_PASTE + larl %r0,boot_vdso_data stg %r0,__LC_VDSO_PER_CPU # # Setup stack diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index 6d9f73b..7b87991 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -433,16 +433,13 @@ int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *s; char *secstrings; - if (IS_ENABLED(CONFIG_ALTERNATIVES)) { - secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; - for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { - if (!strcmp(".altinstructions", - secstrings + s->sh_name)) { - /* patch .altinstructions */ - void *aseg = (void *)s->sh_addr; + secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".altinstructions", secstrings + s->sh_name)) { + /* patch .altinstructions */ + void *aseg = (void *)s->sh_addr; - apply_alternatives(aseg, aseg + s->sh_size); - } + apply_alternatives(aseg, aseg + s->sh_size); } } diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c index 3f3cda4..6ff1692 100644 --- a/arch/s390/kernel/nmi.c +++ b/arch/s390/kernel/nmi.c @@ -191,7 +191,6 @@ static int notrace s390_check_registers(union mci mci, int umode) { union ctlreg2 cr2; int kill_task; - void *fpt_save_area; kill_task = 0; @@ -224,7 +223,6 @@ static int notrace s390_check_registers(union mci mci, int umode) if (!test_cpu_flag(CIF_FPU)) kill_task = 1; } - fpt_save_area = &S390_lowcore.floating_pt_save_area; if (!mci.fc) { /* * Floating point control register can't be restored. diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c index bd4bbf6..227b38b 100644 --- a/arch/s390/kernel/perf_cpum_sf.c +++ b/arch/s390/kernel/perf_cpum_sf.c @@ -15,6 +15,7 @@ #include <linux/kernel_stat.h> #include <linux/perf_event.h> #include <linux/percpu.h> +#include <linux/pid.h> #include <linux/notifier.h> #include <linux/export.h> #include <linux/slab.h> @@ -77,6 +78,15 @@ struct sf_buffer { unsigned long *tail; /* last sample-data-block-table */ }; +struct aux_buffer { + struct sf_buffer sfb; + unsigned long head; /* index of SDB of buffer head */ + unsigned long alert_mark; /* index of SDB of alert request position */ + unsigned long empty_mark; /* mark of SDB not marked full */ + unsigned long *sdb_index; /* SDB address for fast lookup */ + unsigned long *sdbt_index; /* SDBT address for fast lookup */ +}; + struct cpu_hw_sf { /* CPU-measurement sampling information block */ struct hws_qsi_info_block qsi; @@ -85,6 +95,7 @@ struct cpu_hw_sf { struct sf_buffer sfb; /* Sampling buffer */ unsigned int flags; /* Status flags */ struct perf_event *event; /* Scheduled perf event */ + struct perf_output_handle handle; /* AUX buffer output handle */ }; static DEFINE_PER_CPU(struct cpu_hw_sf, cpu_hw_sf); @@ -341,22 +352,6 @@ static void sfb_init_allocs(unsigned long num, struct hw_perf_event *hwc) sfb_account_allocs(num, hwc); } -static size_t event_sample_size(struct hw_perf_event *hwc) -{ - struct sf_raw_sample *sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(hwc); - size_t sample_size; - - /* The sample size depends on the sampling function: The basic-sampling - * function must be always enabled, diagnostic-sampling function is - * optional. - */ - sample_size = sfr->bsdes; - if (SAMPL_DIAG_MODE(hwc)) - sample_size += sfr->dsdes; - - return sample_size; -} - static void deallocate_buffers(struct cpu_hw_sf *cpuhw) { if (cpuhw->sfb.sdbt) @@ -366,35 +361,7 @@ static void deallocate_buffers(struct cpu_hw_sf *cpuhw) static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) { unsigned long n_sdb, freq, factor; - size_t sfr_size, sample_size; - struct sf_raw_sample *sfr; - - /* Allocate raw sample buffer - * - * The raw sample buffer is used to temporarily store sampling data - * entries for perf raw sample processing. The buffer size mainly - * depends on the size of diagnostic-sampling data entries which is - * machine-specific. The exact size calculation includes: - * 1. The first 4 bytes of diagnostic-sampling data entries are - * already reflected in the sf_raw_sample structure. Subtract - * these bytes. - * 2. The perf raw sample data must be 8-byte aligned (u64) and - * perf's internal data size must be considered too. So add - * an additional u32 for correct alignment and subtract before - * allocating the buffer. - * 3. Store the raw sample buffer pointer in the perf event - * hardware structure. - */ - sfr_size = ALIGN((sizeof(*sfr) - sizeof(sfr->diag) + cpuhw->qsi.dsdes) + - sizeof(u32), sizeof(u64)); - sfr_size -= sizeof(u32); - sfr = kzalloc(sfr_size, GFP_KERNEL); - if (!sfr) - return -ENOMEM; - sfr->size = sfr_size; - sfr->bsdes = cpuhw->qsi.bsdes; - sfr->dsdes = cpuhw->qsi.dsdes; - RAWSAMPLE_REG(hwc) = (unsigned long) sfr; + size_t sample_size; /* Calculate sampling buffers using 4K pages * @@ -420,7 +387,7 @@ static int allocate_buffers(struct cpu_hw_sf *cpuhw, struct hw_perf_event *hwc) * ensure a minimum of CPUM_SF_MIN_SDBT (one table can manage up * to 511 SDBs). */ - sample_size = event_sample_size(hwc); + sample_size = sizeof(struct hws_basic_entry); freq = sample_rate_to_freq(&cpuhw->qsi, SAMPL_RATE(hwc)); factor = 1; n_sdb = DIV_ROUND_UP(freq, factor * ((PAGE_SIZE-64) / sample_size)); @@ -619,10 +586,6 @@ static int reserve_pmc_hardware(void) static void hw_perf_event_destroy(struct perf_event *event) { - /* Free raw sample buffer */ - if (RAWSAMPLE_REG(&event->hw)) - kfree((void *) RAWSAMPLE_REG(&event->hw)); - /* Release PMC if this is the last perf event */ if (!atomic_add_unless(&num_events, -1, 1)) { mutex_lock(&pmc_reserve_mutex); @@ -642,15 +605,8 @@ static void hw_init_period(struct hw_perf_event *hwc, u64 period) static void hw_reset_registers(struct hw_perf_event *hwc, unsigned long *sdbt_origin) { - struct sf_raw_sample *sfr; - /* (Re)set to first sample-data-block-table */ TEAR_REG(hwc) = (unsigned long) sdbt_origin; - - /* (Re)set raw sampling buffer register */ - sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(hwc); - memset(&sfr->basic, 0, sizeof(sfr->basic)); - memset(&sfr->diag, 0, sfr->dsdes); } static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si, @@ -660,6 +616,67 @@ static unsigned long hw_limit_rate(const struct hws_qsi_info_block *si, si->min_sampl_rate, si->max_sampl_rate); } +static u32 cpumsf_pid_type(struct perf_event *event, + u32 pid, enum pid_type type) +{ + struct task_struct *tsk; + + /* Idle process */ + if (!pid) + goto out; + + tsk = find_task_by_pid_ns(pid, &init_pid_ns); + pid = -1; + if (tsk) { + /* + * Only top level events contain the pid namespace in which + * they are created. + */ + if (event->parent) + event = event->parent; + pid = __task_pid_nr_ns(tsk, type, event->ns); + /* + * See also 1d953111b648 + * "perf/core: Don't report zero PIDs for exiting tasks". + */ + if (!pid && !pid_alive(tsk)) + pid = -1; + } +out: + return pid; +} + +static void cpumsf_output_event_pid(struct perf_event *event, + struct perf_sample_data *data, + struct pt_regs *regs) +{ + u32 pid; + struct perf_event_header header; + struct perf_output_handle handle; + + /* + * Obtain the PID from the basic-sampling data entry and + * correct the data->tid_entry.pid value. + */ + pid = data->tid_entry.pid; + + /* Protect callchain buffers, tasks */ + rcu_read_lock(); + + perf_prepare_sample(&header, data, event, regs); + if (perf_output_begin(&handle, event, header.size)) + goto out; + + /* Update the process ID (see also kernel/events/core.c) */ + data->tid_entry.pid = cpumsf_pid_type(event, pid, __PIDTYPE_TGID); + data->tid_entry.tid = cpumsf_pid_type(event, pid, PIDTYPE_PID); + + perf_output_sample(&handle, &header, data, event); + perf_output_end(&handle); +out: + rcu_read_unlock(); +} + static int __hw_perf_event_init(struct perf_event *event) { struct cpu_hw_sf *cpuhw; @@ -770,6 +787,10 @@ static int __hw_perf_event_init(struct perf_event *event) hwc->extra_reg.reg = REG_OVERFLOW; OVERFLOW_REG(hwc) = 0; + /* Use AUX buffer. No need to allocate it by ourself */ + if (attr->config == PERF_EVENT_CPUM_SF_DIAG) + return 0; + /* Allocate the per-CPU sampling buffer using the CPU information * from the event. If the event is not pinned to a particular * CPU (event->cpu == -1; or cpuhw == NULL), allocate sampling @@ -789,6 +810,14 @@ static int __hw_perf_event_init(struct perf_event *event) break; } } + + /* If PID/TID sampling is active, replace the default overflow + * handler to extract and resolve the PIDs from the basic-sampling + * data entries. + */ + if (event->attr.sample_type & PERF_SAMPLE_TID) + if (is_default_overflow_handler(event)) + event->overflow_handler = cpumsf_output_event_pid; out: return err; } @@ -866,10 +895,15 @@ static void cpumsf_pmu_enable(struct pmu *pmu) */ if (cpuhw->event) { hwc = &cpuhw->event->hw; - /* Account number of overflow-designated buffer extents */ - sfb_account_overflows(cpuhw, hwc); - if (sfb_has_pending_allocs(&cpuhw->sfb, hwc)) - extend_sampling_buffer(&cpuhw->sfb, hwc); + if (!(SAMPL_DIAG_MODE(hwc))) { + /* + * Account number of overflow-designated + * buffer extents + */ + sfb_account_overflows(cpuhw, hwc); + if (sfb_has_pending_allocs(&cpuhw->sfb, hwc)) + extend_sampling_buffer(&cpuhw->sfb, hwc); + } } /* (Re)enable the PMU and sampling facility */ @@ -884,6 +918,9 @@ static void cpumsf_pmu_enable(struct pmu *pmu) return; } + /* Load current program parameter */ + lpp(&S390_lowcore.lpp); + debug_sprintf_event(sfdbg, 6, "pmu_enable: es=%i cs=%i ed=%i cd=%i " "tear=%p dear=%p\n", cpuhw->lsctl.es, cpuhw->lsctl.cs, cpuhw->lsctl.ed, cpuhw->lsctl.cd, @@ -967,22 +1004,16 @@ static int perf_exclude_event(struct perf_event *event, struct pt_regs *regs, * * Return non-zero if an event overflow occurred. */ -static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) +static int perf_push_sample(struct perf_event *event, + struct hws_basic_entry *basic) { int overflow; struct pt_regs regs; struct perf_sf_sde_regs *sde_regs; struct perf_sample_data data; - struct perf_raw_record raw = { - .frag = { - .size = sfr->size, - .data = sfr, - }, - }; /* Setup perf sample */ perf_sample_data_init(&data, 0, event->hw.last_period); - data.raw = &raw; /* Setup pt_regs to look like an CPU-measurement external interrupt * using the Program Request Alert code. The regs.int_parm_long @@ -994,11 +1025,11 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) regs.int_parm = CPU_MF_INT_SF_PRA; sde_regs = (struct perf_sf_sde_regs *) ®s.int_parm_long; - psw_bits(regs.psw).ia = sfr->basic.ia; - psw_bits(regs.psw).dat = sfr->basic.T; - psw_bits(regs.psw).wait = sfr->basic.W; - psw_bits(regs.psw).pstate = sfr->basic.P; - psw_bits(regs.psw).as = sfr->basic.AS; + psw_bits(regs.psw).ia = basic->ia; + psw_bits(regs.psw).dat = basic->T; + psw_bits(regs.psw).wait = basic->W; + psw_bits(regs.psw).pstate = basic->P; + psw_bits(regs.psw).as = basic->AS; /* * Use the hardware provided configuration level to decide if the @@ -1011,7 +1042,7 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) * If the value differs from 0xffff (the host value), we assume to * be a KVM guest. */ - switch (sfr->basic.CL) { + switch (basic->CL) { case 1: /* logical partition */ sde_regs->in_guest = 0; break; @@ -1019,11 +1050,17 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr) sde_regs->in_guest = 1; break; default: /* old machine, use heuristics */ - if (sfr->basic.gpp || sfr->basic.prim_asn != 0xffff) + if (basic->gpp || basic->prim_asn != 0xffff) sde_regs->in_guest = 1; break; } + /* + * Store the PID value from the sample-data-entry to be + * processed and resolved by cpumsf_output_event_pid(). + */ + data.tid_entry.pid = basic->hpp & LPP_PID_MASK; + overflow = 0; if (perf_exclude_event(event, ®s, sde_regs)) goto out; @@ -1041,75 +1078,12 @@ static void perf_event_count_update(struct perf_event *event, u64 count) local64_add(count, &event->count); } -static int sample_format_is_valid(struct hws_combined_entry *sample, - unsigned int flags) -{ - if (likely(flags & PERF_CPUM_SF_BASIC_MODE)) - /* Only basic-sampling data entries with data-entry-format - * version of 0x0001 can be processed. - */ - if (sample->basic.def != 0x0001) - return 0; - if (flags & PERF_CPUM_SF_DIAG_MODE) - /* The data-entry-format number of diagnostic-sampling data - * entries can vary. Because diagnostic data is just passed - * through, do only a sanity check on the DEF. - */ - if (sample->diag.def < 0x8001) - return 0; - return 1; -} - -static int sample_is_consistent(struct hws_combined_entry *sample, - unsigned long flags) -{ - /* This check applies only to basic-sampling data entries of potentially - * combined-sampling data entries. Invalid entries cannot be processed - * by the PMU and, thus, do not deliver an associated - * diagnostic-sampling data entry. - */ - if (unlikely(!(flags & PERF_CPUM_SF_BASIC_MODE))) - return 0; - /* - * Samples are skipped, if they are invalid or for which the - * instruction address is not predictable, i.e., the wait-state bit is - * set. - */ - if (sample->basic.I || sample->basic.W) - return 0; - return 1; -} - -static void reset_sample_slot(struct hws_combined_entry *sample, - unsigned long flags) -{ - if (likely(flags & PERF_CPUM_SF_BASIC_MODE)) - sample->basic.def = 0; - if (flags & PERF_CPUM_SF_DIAG_MODE) - sample->diag.def = 0; -} - -static void sfr_store_sample(struct sf_raw_sample *sfr, - struct hws_combined_entry *sample) -{ - if (likely(sfr->format & PERF_CPUM_SF_BASIC_MODE)) - sfr->basic = sample->basic; - if (sfr->format & PERF_CPUM_SF_DIAG_MODE) - memcpy(&sfr->diag, &sample->diag, sfr->dsdes); -} - -static void debug_sample_entry(struct hws_combined_entry *sample, - struct hws_trailer_entry *te, - unsigned long flags) +static void debug_sample_entry(struct hws_basic_entry *sample, + struct hws_trailer_entry *te) { debug_sprintf_event(sfdbg, 4, "hw_collect_samples: Found unknown " - "sampling data entry: te->f=%i basic.def=%04x (%p)" - " diag.def=%04x (%p)\n", te->f, - sample->basic.def, &sample->basic, - (flags & PERF_CPUM_SF_DIAG_MODE) - ? sample->diag.def : 0xFFFF, - (flags & PERF_CPUM_SF_DIAG_MODE) - ? &sample->diag : NULL); + "sampling data entry: te->f=%i basic.def=%04x (%p)\n", + te->f, sample->def, sample); } /* hw_collect_samples() - Walk through a sample-data-block and collect samples @@ -1135,44 +1109,37 @@ static void debug_sample_entry(struct hws_combined_entry *sample, static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, unsigned long long *overflow) { - unsigned long flags = SAMPL_FLAGS(&event->hw); - struct hws_combined_entry *sample; struct hws_trailer_entry *te; - struct sf_raw_sample *sfr; - size_t sample_size; - - /* Prepare and initialize raw sample data */ - sfr = (struct sf_raw_sample *) RAWSAMPLE_REG(&event->hw); - sfr->format = flags & PERF_CPUM_SF_MODE_MASK; + struct hws_basic_entry *sample; - sample_size = event_sample_size(&event->hw); te = (struct hws_trailer_entry *) trailer_entry_ptr(*sdbt); - sample = (struct hws_combined_entry *) *sdbt; + sample = (struct hws_basic_entry *) *sdbt; while ((unsigned long *) sample < (unsigned long *) te) { /* Check for an empty sample */ - if (!sample->basic.def) + if (!sample->def) break; /* Update perf event period */ perf_event_count_update(event, SAMPL_RATE(&event->hw)); - /* Check sampling data entry */ - if (sample_format_is_valid(sample, flags)) { + /* Check whether sample is valid */ + if (sample->def == 0x0001) { /* If an event overflow occurred, the PMU is stopped to * throttle event delivery. Remaining sample data is * discarded. */ if (!*overflow) { - if (sample_is_consistent(sample, flags)) { + /* Check whether sample is consistent */ + if (sample->I == 0 && sample->W == 0) { /* Deliver sample data to perf */ - sfr_store_sample(sfr, sample); - *overflow = perf_push_sample(event, sfr); + *overflow = perf_push_sample(event, + sample); } } else /* Count discarded samples */ *overflow += 1; } else { - debug_sample_entry(sample, te, flags); + debug_sample_entry(sample, te); /* Sample slot is not yet written or other record. * * This condition can occur if the buffer was reused @@ -1188,8 +1155,8 @@ static void hw_collect_samples(struct perf_event *event, unsigned long *sdbt, } /* Reset sample slot and advance to next sample */ - reset_sample_slot(sample, flags); - sample += sample_size; + sample->def = 0; + sample++; } } @@ -1215,6 +1182,13 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) unsigned long long event_overflow, sampl_overflow, num_sdb, te_flags; int done; + /* + * AUX buffer is used when in diagnostic sampling mode. + * No perf events/samples are created. + */ + if (SAMPL_DIAG_MODE(&event->hw)) + return; + if (flush_all && SDB_FULL_BLOCKS(hwc)) flush_all = 0; @@ -1291,6 +1265,439 @@ static void hw_perf_event_update(struct perf_event *event, int flush_all) sampl_overflow, event_overflow); } +#define AUX_SDB_INDEX(aux, i) ((i) % aux->sfb.num_sdb) +#define AUX_SDB_NUM(aux, start, end) (end >= start ? end - start + 1 : 0) +#define AUX_SDB_NUM_ALERT(aux) AUX_SDB_NUM(aux, aux->head, aux->alert_mark) +#define AUX_SDB_NUM_EMPTY(aux) AUX_SDB_NUM(aux, aux->head, aux->empty_mark) + +/* + * Get trailer entry by index of SDB. + */ +static struct hws_trailer_entry *aux_sdb_trailer(struct aux_buffer *aux, + unsigned long index) +{ + unsigned long sdb; + + index = AUX_SDB_INDEX(aux, index); + sdb = aux->sdb_index[index]; + return (struct hws_trailer_entry *)trailer_entry_ptr(sdb); +} + +/* + * Finish sampling on the cpu. Called by cpumsf_pmu_del() with pmu + * disabled. Collect the full SDBs in AUX buffer which have not reached + * the point of alert indicator. And ignore the SDBs which are not + * full. + * + * 1. Scan SDBs to see how much data is there and consume them. + * 2. Remove alert indicator in the buffer. + */ +static void aux_output_end(struct perf_output_handle *handle) +{ + unsigned long i, range_scan, idx; + struct aux_buffer *aux; + struct hws_trailer_entry *te; + + aux = perf_get_aux(handle); + if (!aux) + return; + + range_scan = AUX_SDB_NUM_ALERT(aux); + for (i = 0, idx = aux->head; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + if (!(te->flags & SDB_TE_BUFFER_FULL_MASK)) + break; + } + /* i is num of SDBs which are full */ + perf_aux_output_end(handle, i << PAGE_SHIFT); + + /* Remove alert indicators in the buffer */ + te = aux_sdb_trailer(aux, aux->alert_mark); + te->flags &= ~SDB_TE_ALERT_REQ_MASK; + + debug_sprintf_event(sfdbg, 6, "aux_output_end: collect %lx SDBs\n", i); +} + +/* + * Start sampling on the CPU. Called by cpumsf_pmu_add() when an event + * is first added to the CPU or rescheduled again to the CPU. It is called + * with pmu disabled. + * + * 1. Reset the trailer of SDBs to get ready for new data. + * 2. Tell the hardware where to put the data by reset the SDBs buffer + * head(tear/dear). + */ +static int aux_output_begin(struct perf_output_handle *handle, + struct aux_buffer *aux, + struct cpu_hw_sf *cpuhw) +{ + unsigned long range; + unsigned long i, range_scan, idx; + unsigned long head, base, offset; + struct hws_trailer_entry *te; + + if (WARN_ON_ONCE(handle->head & ~PAGE_MASK)) + return -EINVAL; + + aux->head = handle->head >> PAGE_SHIFT; + range = (handle->size + 1) >> PAGE_SHIFT; + if (range <= 1) + return -ENOMEM; + + /* + * SDBs between aux->head and aux->empty_mark are already ready + * for new data. range_scan is num of SDBs not within them. + */ + if (range > AUX_SDB_NUM_EMPTY(aux)) { + range_scan = range - AUX_SDB_NUM_EMPTY(aux); + idx = aux->empty_mark + 1; + for (i = 0; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + te->flags = te->flags & ~SDB_TE_BUFFER_FULL_MASK; + te->flags = te->flags & ~SDB_TE_ALERT_REQ_MASK; + te->overflow = 0; + } + /* Save the position of empty SDBs */ + aux->empty_mark = aux->head + range - 1; + } + + /* Set alert indicator */ + aux->alert_mark = aux->head + range/2 - 1; + te = aux_sdb_trailer(aux, aux->alert_mark); + te->flags = te->flags | SDB_TE_ALERT_REQ_MASK; + + /* Reset hardware buffer head */ + head = AUX_SDB_INDEX(aux, aux->head); + base = aux->sdbt_index[head / CPUM_SF_SDB_PER_TABLE]; + offset = head % CPUM_SF_SDB_PER_TABLE; + cpuhw->lsctl.tear = base + offset * sizeof(unsigned long); + cpuhw->lsctl.dear = aux->sdb_index[head]; + + debug_sprintf_event(sfdbg, 6, "aux_output_begin: " + "head->alert_mark->empty_mark (num_alert, range)" + "[%lx -> %lx -> %lx] (%lx, %lx) " + "tear index %lx, tear %lx dear %lx\n", + aux->head, aux->alert_mark, aux->empty_mark, + AUX_SDB_NUM_ALERT(aux), range, + head / CPUM_SF_SDB_PER_TABLE, + cpuhw->lsctl.tear, + cpuhw->lsctl.dear); + + return 0; +} + +/* + * Set alert indicator on SDB at index @alert_index while sampler is running. + * + * Return true if successfully. + * Return false if full indicator is already set by hardware sampler. + */ +static bool aux_set_alert(struct aux_buffer *aux, unsigned long alert_index, + unsigned long long *overflow) +{ + unsigned long long orig_overflow, orig_flags, new_flags; + struct hws_trailer_entry *te; + + te = aux_sdb_trailer(aux, alert_index); + do { + orig_flags = te->flags; + orig_overflow = te->overflow; + *overflow = orig_overflow; + if (orig_flags & SDB_TE_BUFFER_FULL_MASK) { + /* + * SDB is already set by hardware. + * Abort and try to set somewhere + * behind. + */ + return false; + } + new_flags = orig_flags | SDB_TE_ALERT_REQ_MASK; + } while (!cmpxchg_double(&te->flags, &te->overflow, + orig_flags, orig_overflow, + new_flags, 0ULL)); + return true; +} + +/* + * aux_reset_buffer() - Scan and setup SDBs for new samples + * @aux: The AUX buffer to set + * @range: The range of SDBs to scan started from aux->head + * @overflow: Set to overflow count + * + * Set alert indicator on the SDB at index of aux->alert_mark. If this SDB is + * marked as empty, check if it is already set full by the hardware sampler. + * If yes, that means new data is already there before we can set an alert + * indicator. Caller should try to set alert indicator to some position behind. + * + * Scan the SDBs in AUX buffer from behind aux->empty_mark. They are used + * previously and have already been consumed by user space. Reset these SDBs + * (clear full indicator and alert indicator) for new data. + * If aux->alert_mark fall in this area, just set it. Overflow count is + * recorded while scanning. + * + * SDBs between aux->head and aux->empty_mark are already reset at last time. + * and ready for new samples. So scanning on this area could be skipped. + * + * Return true if alert indicator is set successfully and false if not. + */ +static bool aux_reset_buffer(struct aux_buffer *aux, unsigned long range, + unsigned long long *overflow) +{ + unsigned long long orig_overflow, orig_flags, new_flags; + unsigned long i, range_scan, idx; + struct hws_trailer_entry *te; + + if (range <= AUX_SDB_NUM_EMPTY(aux)) + /* + * No need to scan. All SDBs in range are marked as empty. + * Just set alert indicator. Should check race with hardware + * sampler. + */ + return aux_set_alert(aux, aux->alert_mark, overflow); + + if (aux->alert_mark <= aux->empty_mark) + /* + * Set alert indicator on empty SDB. Should check race + * with hardware sampler. + */ + if (!aux_set_alert(aux, aux->alert_mark, overflow)) + return false; + + /* + * Scan the SDBs to clear full and alert indicator used previously. + * Start scanning from one SDB behind empty_mark. If the new alert + * indicator fall into this range, set it. + */ + range_scan = range - AUX_SDB_NUM_EMPTY(aux); + idx = aux->empty_mark + 1; + for (i = 0; i < range_scan; i++, idx++) { + te = aux_sdb_trailer(aux, idx); + do { + orig_flags = te->flags; + orig_overflow = te->overflow; + new_flags = orig_flags & ~SDB_TE_BUFFER_FULL_MASK; + if (idx == aux->alert_mark) + new_flags |= SDB_TE_ALERT_REQ_MASK; + else + new_flags &= ~SDB_TE_ALERT_REQ_MASK; + } while (!cmpxchg_double(&te->flags, &te->overflow, + orig_flags, orig_overflow, + new_flags, 0ULL)); + *overflow += orig_overflow; + } + + /* Update empty_mark to new position */ + aux->empty_mark = aux->head + range - 1; + + return true; +} + +/* + * Measurement alert handler for diagnostic mode sampling. + */ +static void hw_collect_aux(struct cpu_hw_sf *cpuhw) +{ + struct aux_buffer *aux; + int done = 0; + unsigned long range = 0, size; + unsigned long long overflow = 0; + struct perf_output_handle *handle = &cpuhw->handle; + unsigned long num_sdb; + + aux = perf_get_aux(handle); + if (WARN_ON_ONCE(!aux)) + return; + + /* Inform user space new data arrived */ + size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + perf_aux_output_end(handle, size); + num_sdb = aux->sfb.num_sdb; + + while (!done) { + /* Get an output handle */ + aux = perf_aux_output_begin(handle, cpuhw->event); + if (handle->size == 0) { + pr_err("The AUX buffer with %lu pages for the " + "diagnostic-sampling mode is full\n", + num_sdb); + debug_sprintf_event(sfdbg, 1, "AUX buffer used up\n"); + break; + } + if (WARN_ON_ONCE(!aux)) + return; + + /* Update head and alert_mark to new position */ + aux->head = handle->head >> PAGE_SHIFT; + range = (handle->size + 1) >> PAGE_SHIFT; + if (range == 1) + aux->alert_mark = aux->head; + else + aux->alert_mark = aux->head + range/2 - 1; + + if (aux_reset_buffer(aux, range, &overflow)) { + if (!overflow) { + done = 1; + break; + } + size = range << PAGE_SHIFT; + perf_aux_output_end(&cpuhw->handle, size); + pr_err("Sample data caused the AUX buffer with %lu " + "pages to overflow\n", num_sdb); + debug_sprintf_event(sfdbg, 1, "head %lx range %lx " + "overflow %llx\n", + aux->head, range, overflow); + } else { + size = AUX_SDB_NUM_ALERT(aux) << PAGE_SHIFT; + perf_aux_output_end(&cpuhw->handle, size); + debug_sprintf_event(sfdbg, 6, "head %lx alert %lx " + "already full, try another\n", + aux->head, aux->alert_mark); + } + } + + if (done) + debug_sprintf_event(sfdbg, 6, "aux_reset_buffer: " + "[%lx -> %lx -> %lx] (%lx, %lx)\n", + aux->head, aux->alert_mark, aux->empty_mark, + AUX_SDB_NUM_ALERT(aux), range); +} + +/* + * Callback when freeing AUX buffers. + */ +static void aux_buffer_free(void *data) +{ + struct aux_buffer *aux = data; + unsigned long i, num_sdbt; + + if (!aux) + return; + + /* Free SDBT. SDB is freed by the caller */ + num_sdbt = aux->sfb.num_sdbt; + for (i = 0; i < num_sdbt; i++) + free_page(aux->sdbt_index[i]); + + kfree(aux->sdbt_index); + kfree(aux->sdb_index); + kfree(aux); + + debug_sprintf_event(sfdbg, 4, "aux_buffer_free: free " + "%lu SDBTs\n", num_sdbt); +} + +/* + * aux_buffer_setup() - Setup AUX buffer for diagnostic mode sampling + * @cpu: On which to allocate, -1 means current + * @pages: Array of pointers to buffer pages passed from perf core + * @nr_pages: Total pages + * @snapshot: Flag for snapshot mode + * + * This is the callback when setup an event using AUX buffer. Perf tool can + * trigger this by an additional mmap() call on the event. Unlike the buffer + * for basic samples, AUX buffer belongs to the event. It is scheduled with + * the task among online cpus when it is a per-thread event. + * + * Return the private AUX buffer structure if success or NULL if fails. + */ +static void *aux_buffer_setup(int cpu, void **pages, int nr_pages, + bool snapshot) +{ + struct sf_buffer *sfb; + struct aux_buffer *aux; + unsigned long *new, *tail; + int i, n_sdbt; + + if (!nr_pages || !pages) + return NULL; + + if (nr_pages > CPUM_SF_MAX_SDB * CPUM_SF_SDB_DIAG_FACTOR) { + pr_err("AUX buffer size (%i pages) is larger than the " + "maximum sampling buffer limit\n", + nr_pages); + return NULL; + } else if (nr_pages < CPUM_SF_MIN_SDB * CPUM_SF_SDB_DIAG_FACTOR) { + pr_err("AUX buffer size (%i pages) is less than the " + "minimum sampling buffer limit\n", + nr_pages); + return NULL; + } + + /* Allocate aux_buffer struct for the event */ + aux = kmalloc(sizeof(struct aux_buffer), GFP_KERNEL); + if (!aux) + goto no_aux; + sfb = &aux->sfb; + + /* Allocate sdbt_index for fast reference */ + n_sdbt = (nr_pages + CPUM_SF_SDB_PER_TABLE - 1) / CPUM_SF_SDB_PER_TABLE; + aux->sdbt_index = kmalloc_array(n_sdbt, sizeof(void *), GFP_KERNEL); + if (!aux->sdbt_index) + goto no_sdbt_index; + + /* Allocate sdb_index for fast reference */ + aux->sdb_index = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL); + if (!aux->sdb_index) + goto no_sdb_index; + + /* Allocate the first SDBT */ + sfb->num_sdbt = 0; + sfb->sdbt = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!sfb->sdbt) + goto no_sdbt; + aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)sfb->sdbt; + tail = sfb->tail = sfb->sdbt; + + /* + * Link the provided pages of AUX buffer to SDBT. + * Allocate SDBT if needed. + */ + for (i = 0; i < nr_pages; i++, tail++) { + if (require_table_link(tail)) { + new = (unsigned long *) get_zeroed_page(GFP_KERNEL); + if (!new) + goto no_sdbt; + aux->sdbt_index[sfb->num_sdbt++] = (unsigned long)new; + /* Link current page to tail of chain */ + *tail = (unsigned long)(void *) new + 1; + tail = new; + } + /* Tail is the entry in a SDBT */ + *tail = (unsigned long)pages[i]; + aux->sdb_index[i] = (unsigned long)pages[i]; + } + sfb->num_sdb = nr_pages; + + /* Link the last entry in the SDBT to the first SDBT */ + *tail = (unsigned long) sfb->sdbt + 1; + sfb->tail = tail; + + /* + * Initial all SDBs are zeroed. Mark it as empty. + * So there is no need to clear the full indicator + * when this event is first added. + */ + aux->empty_mark = sfb->num_sdb - 1; + + debug_sprintf_event(sfdbg, 4, "aux_buffer_setup: setup %lu SDBTs" + " and %lu SDBs\n", + sfb->num_sdbt, sfb->num_sdb); + + return aux; + +no_sdbt: + /* SDBs (AUX buffer pages) are freed by caller */ + for (i = 0; i < sfb->num_sdbt; i++) + free_page(aux->sdbt_index[i]); + kfree(aux->sdb_index); +no_sdb_index: + kfree(aux->sdbt_index); +no_sdbt_index: + kfree(aux); +no_aux: + return NULL; +} + static void cpumsf_pmu_read(struct perf_event *event) { /* Nothing to do ... updates are interrupt-driven */ @@ -1342,12 +1749,13 @@ static void cpumsf_pmu_stop(struct perf_event *event, int flags) static int cpumsf_pmu_add(struct perf_event *event, int flags) { struct cpu_hw_sf *cpuhw = this_cpu_ptr(&cpu_hw_sf); + struct aux_buffer *aux; int err; if (cpuhw->flags & PMU_F_IN_USE) return -EAGAIN; - if (!cpuhw->sfb.sdbt) + if (!SAMPL_DIAG_MODE(&event->hw) && !cpuhw->sfb.sdbt) return -EINVAL; err = 0; @@ -1362,10 +1770,12 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) */ cpuhw->lsctl.s = 0; cpuhw->lsctl.h = 1; - cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; - cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; cpuhw->lsctl.interval = SAMPL_RATE(&event->hw); - hw_reset_registers(&event->hw, cpuhw->sfb.sdbt); + if (!SAMPL_DIAG_MODE(&event->hw)) { + cpuhw->lsctl.tear = (unsigned long) cpuhw->sfb.sdbt; + cpuhw->lsctl.dear = *(unsigned long *) cpuhw->sfb.sdbt; + hw_reset_registers(&event->hw, cpuhw->sfb.sdbt); + } /* Ensure sampling functions are in the disabled state. If disabled, * switch on sampling enable control. */ @@ -1373,9 +1783,18 @@ static int cpumsf_pmu_add(struct perf_event *event, int flags) err = -EAGAIN; goto out; } - cpuhw->lsctl.es = 1; - if (SAMPL_DIAG_MODE(&event->hw)) + if (SAMPL_DIAG_MODE(&event->hw)) { + aux = perf_aux_output_begin(&cpuhw->handle, event); + if (!aux) { + err = -EINVAL; + goto out; + } + err = aux_output_begin(&cpuhw->handle, aux, cpuhw); + if (err) + goto out; cpuhw->lsctl.ed = 1; + } + cpuhw->lsctl.es = 1; /* Set in_use flag and store event */ cpuhw->event = event; @@ -1401,6 +1820,8 @@ static void cpumsf_pmu_del(struct perf_event *event, int flags) cpuhw->flags &= ~PMU_F_IN_USE; cpuhw->event = NULL; + if (SAMPL_DIAG_MODE(&event->hw)) + aux_output_end(&cpuhw->handle); perf_event_update_userpage(event); perf_pmu_enable(event->pmu); } @@ -1448,6 +1869,9 @@ static struct pmu cpumf_sampling = { .read = cpumsf_pmu_read, .attr_groups = cpumsf_pmu_attr_groups, + + .setup_aux = aux_buffer_setup, + .free_aux = aux_buffer_free, }; static void cpumf_measurement_alert(struct ext_code ext_code, @@ -1471,7 +1895,10 @@ static void cpumf_measurement_alert(struct ext_code ext_code, /* Program alert request */ if (alert & CPU_MF_INT_SF_PRA) { if (cpuhw->flags & PMU_F_IN_USE) - hw_perf_event_update(cpuhw->event, 0); + if (SAMPL_DIAG_MODE(&cpuhw->event->hw)) + hw_collect_aux(cpuhw); + else + hw_perf_event_update(cpuhw->event, 0); else WARN_ON_ONCE(!(cpuhw->flags & PMU_F_IN_USE)); } @@ -1590,6 +2017,9 @@ static int __init init_cpum_sampling_pmu(void) return -ENODEV; } + if (!si.as && !si.ad) + return -ENODEV; + if (si.bsdes != sizeof(struct hws_basic_entry)) { pr_cpumsf_err(RS_INIT_FAILURE_BSDES); return -EINVAL; diff --git a/arch/s390/kernel/perf_regs.c b/arch/s390/kernel/perf_regs.c new file mode 100644 index 0000000..f8603eb --- /dev/null +++ b/arch/s390/kernel/perf_regs.c @@ -0,0 +1,70 @@ +#include <linux/perf_event.h> +#include <linux/perf_regs.h> +#include <linux/kernel.h> +#include <linux/errno.h> +#include <linux/bug.h> +#include <asm/ptrace.h> +#include <asm/fpu/api.h> +#include <asm/fpu/types.h> + +u64 perf_reg_value(struct pt_regs *regs, int idx) +{ + freg_t fp; + + if (WARN_ON_ONCE((u32)idx >= PERF_REG_S390_MAX)) + return 0; + + if (idx >= PERF_REG_S390_R0 && idx <= PERF_REG_S390_R15) + return regs->gprs[idx]; + + if (idx >= PERF_REG_S390_FP0 && idx <= PERF_REG_S390_FP15) { + if (!user_mode(regs)) + return 0; + + idx -= PERF_REG_S390_FP0; + fp = MACHINE_HAS_VX ? *(freg_t *)(current->thread.fpu.vxrs + idx) + : current->thread.fpu.fprs[idx]; + return fp.ui; + } + + if (idx == PERF_REG_S390_MASK) + return regs->psw.mask; + if (idx == PERF_REG_S390_PC) + return regs->psw.addr; + + return regs->gprs[idx]; +} + +#define REG_RESERVED (~((1UL << PERF_REG_S390_MAX) - 1)) + +int perf_reg_validate(u64 mask) +{ + if (!mask || mask & REG_RESERVED) + return -EINVAL; + + return 0; +} + +u64 perf_reg_abi(struct task_struct *task) +{ + if (test_tsk_thread_flag(task, TIF_31BIT)) + return PERF_SAMPLE_REGS_ABI_32; + + return PERF_SAMPLE_REGS_ABI_64; +} + +void perf_get_regs_user(struct perf_regs *regs_user, + struct pt_regs *regs, + struct pt_regs *regs_user_copy) +{ + /* + * Use the regs from the first interruption and let + * perf_sample_regs_intr() handle interrupts (regs == get_irq_regs()). + * + * Also save FPU registers for user-space tasks only. + */ + regs_user->regs = task_pt_regs(current); + if (user_mode(regs_user->regs)) + save_fpu_regs(); + regs_user->abi = perf_reg_abi(current); +} diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 0520854..39a2187 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -158,16 +158,9 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) { unsigned long segment_table, page_table, page_frame; struct vdso_per_cpu_data *vd; - u32 *psal, *aste; - int i; - - lowcore->vdso_per_cpu_data = __LC_PASTE; - - if (!vdso_enabled) - return 0; segment_table = __get_free_pages(GFP_KERNEL, SEGMENT_ORDER); - page_table = get_zeroed_page(GFP_KERNEL | GFP_DMA); + page_table = get_zeroed_page(GFP_KERNEL); page_frame = get_zeroed_page(GFP_KERNEL); if (!segment_table || !page_table || !page_frame) goto out; @@ -179,25 +172,15 @@ int vdso_alloc_per_cpu(struct lowcore *lowcore) vd->cpu_nr = lowcore->cpu_nr; vd->node_id = cpu_to_node(vd->cpu_nr); - /* Set up access register mode page table */ + /* Set up page table for the vdso address space */ memset64((u64 *)segment_table, _SEGMENT_ENTRY_EMPTY, _CRST_ENTRIES); memset64((u64 *)page_table, _PAGE_INVALID, PTRS_PER_PTE); *(unsigned long *) segment_table = _SEGMENT_ENTRY + page_table; *(unsigned long *) page_table = _PAGE_PROTECT + page_frame; - psal = (u32 *) (page_table + 256*sizeof(unsigned long)); - aste = psal + 32; - - for (i = 4; i < 32; i += 4) - psal[i] = 0x80000000; - - lowcore->paste[4] = (u32)(addr_t) psal; - psal[0] = 0x02000000; - psal[2] = (u32)(addr_t) aste; - *(unsigned long *) (aste + 2) = segment_table + + lowcore->vdso_asce = segment_table + _ASCE_TABLE_LENGTH + _ASCE_USER_BITS + _ASCE_TYPE_SEGMENT; - aste[4] = (u32)(addr_t) psal; lowcore->vdso_per_cpu_data = page_frame; return 0; @@ -212,14 +195,8 @@ out: void vdso_free_per_cpu(struct lowcore *lowcore) { unsigned long segment_table, page_table, page_frame; - u32 *psal, *aste; - - if (!vdso_enabled) - return; - psal = (u32 *)(addr_t) lowcore->paste[4]; - aste = (u32 *)(addr_t) psal[2]; - segment_table = *(unsigned long *)(aste + 2) & PAGE_MASK; + segment_table = lowcore->vdso_asce & PAGE_MASK; page_table = *(unsigned long *) segment_table; page_frame = *(unsigned long *) page_table; @@ -228,16 +205,6 @@ void vdso_free_per_cpu(struct lowcore *lowcore) free_pages(segment_table, SEGMENT_ORDER); } -static void vdso_init_cr5(void) -{ - unsigned long cr5; - - if (!vdso_enabled) - return; - cr5 = offsetof(struct lowcore, paste); - __ctl_load(cr5, 5, 5); -} - /* * This is called from binfmt_elf, we create the special vma for the * vDSO and insert it into the mm struct tree @@ -314,8 +281,6 @@ static int __init vdso_init(void) { int i; - if (!vdso_enabled) - return 0; vdso_init_data(vdso_data); #ifdef CONFIG_COMPAT /* Calculate the size of the 32 bit vDSO */ @@ -354,7 +319,6 @@ static int __init vdso_init(void) vdso64_pagelist[vdso64_pages] = NULL; if (vdso_alloc_per_cpu(&S390_lowcore)) BUG(); - vdso_init_cr5(); get_page(virt_to_page(vdso_data)); diff --git a/arch/s390/kernel/vdso32/getcpu.S b/arch/s390/kernel/vdso32/getcpu.S index 6e30769..5477a2c 100644 --- a/arch/s390/kernel/vdso32/getcpu.S +++ b/arch/s390/kernel/vdso32/getcpu.S @@ -15,23 +15,11 @@ .type __kernel_getcpu,@function __kernel_getcpu: .cfi_startproc - ear %r1,%a4 - lhi %r4,1 - sll %r4,24 - sar %a4,%r4 la %r4,0 - epsw %r0,0 - sacf 512 + sacf 256 l %r5,__VDSO_CPU_NR(%r4) l %r4,__VDSO_NODE_ID(%r4) - tml %r0,0x4000 - jo 1f - tml %r0,0x8000 - jno 0f - sacf 256 - j 1f -0: sacf 0 -1: sar %a4,%r1 + sacf 0 ltr %r2,%r2 jz 2f st %r5,0(%r2) diff --git a/arch/s390/kernel/vdso64/clock_gettime.S b/arch/s390/kernel/vdso64/clock_gettime.S index 9c3b126..5d7b56b 100644 --- a/arch/s390/kernel/vdso64/clock_gettime.S +++ b/arch/s390/kernel/vdso64/clock_gettime.S @@ -114,23 +114,12 @@ __kernel_clock_gettime: br %r14 /* CPUCLOCK_VIRT for this thread */ -9: icm %r0,15,__VDSO_ECTG_OK(%r5) +9: lghi %r4,0 + icm %r0,15,__VDSO_ECTG_OK(%r5) jz 12f - ear %r2,%a4 - llilh %r4,0x0100 - sar %a4,%r4 - lghi %r4,0 - epsw %r5,0 - sacf 512 /* Magic ectg instruction */ + sacf 256 /* Magic ectg instruction */ .insn ssf,0xc80100000000,__VDSO_ECTG_BASE(4),__VDSO_ECTG_USER(4),4 - tml %r5,0x4000 - jo 11f - tml %r5,0x8000 - jno 10f - sacf 256 - j 11f -10: sacf 0 -11: sar %a4,%r2 + sacf 0 algr %r1,%r0 /* r1 = cputime as TOD value */ mghi %r1,1000 /* convert to nanoseconds */ srlg %r1,%r1,12 /* r1 = cputime in nanosec */ diff --git a/arch/s390/kernel/vdso64/getcpu.S b/arch/s390/kernel/vdso64/getcpu.S index 4398376..e9c3436 100644 --- a/arch/s390/kernel/vdso64/getcpu.S +++ b/arch/s390/kernel/vdso64/getcpu.S @@ -15,22 +15,11 @@ .type __kernel_getcpu,@function __kernel_getcpu: .cfi_startproc - ear %r1,%a4 - llilh %r4,0x0100 - sar %a4,%r4 la %r4,0 - epsw %r0,0 - sacf 512 + sacf 256 l %r5,__VDSO_CPU_NR(%r4) l %r4,__VDSO_NODE_ID(%r4) - tml %r0,0x4000 - jo 1f - tml %r0,0x8000 - jno 0f - sacf 256 - j 1f -0: sacf 0 -1: sar %a4,%r1 + sacf 0 ltgr %r2,%r2 jz 2f st %r5,0(%r2) |