From 72c27d71d82569aec187c30f6ff208631abc02f4 Mon Sep 17 00:00:00 2001 From: jkoshy Date: Fri, 7 Dec 2007 08:20:17 +0000 Subject: Kernel and hwpmc(4) support for callchain capture. Sponsored by: FreeBSD Foundation and Google Inc. --- sys/dev/hwpmc/hwpmc_amd.c | 28 ++-- sys/dev/hwpmc/hwpmc_logging.c | 53 ++++--- sys/dev/hwpmc/hwpmc_mod.c | 347 ++++++++++++++++++++++++++++++++++-------- sys/dev/hwpmc/hwpmc_piv.c | 30 ++-- sys/dev/hwpmc/hwpmc_ppro.c | 10 +- sys/dev/hwpmc/hwpmc_x86.c | 191 +++++++++++++++++++++++ 6 files changed, 547 insertions(+), 112 deletions(-) (limited to 'sys/dev/hwpmc') diff --git a/sys/dev/hwpmc/hwpmc_amd.c b/sys/dev/hwpmc/hwpmc_amd.c index b7cdee8..3576234 100644 --- a/sys/dev/hwpmc/hwpmc_amd.c +++ b/sys/dev/hwpmc/hwpmc_amd.c @@ -1,7 +1,11 @@ /*- - * Copyright (c) 2003-2005 Joseph Koshy + * Copyright (c) 2003-2007 Joseph Koshy + * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -38,9 +42,9 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include -#include #include #ifdef DEBUG @@ -667,7 +671,7 @@ amd_stop_pmc(int cpu, int ri) */ static int -amd_intr(int cpu, uintptr_t eip, int usermode) +amd_intr(int cpu, struct trapframe *tf) { int i, error, retval, ri; uint32_t config, evsel, perfctr; @@ -679,8 +683,8 @@ amd_intr(int cpu, uintptr_t eip, int usermode) KASSERT(cpu >= 0 && cpu < mp_ncpus, ("[amd,%d] out of range CPU %d", __LINE__, cpu)); - PMCDBG(MDP,INT,1, "cpu=%d eip=%p um=%d", cpu, (void *) eip, - usermode); + PMCDBG(MDP,INT,1, "cpu=%d tf=0x%p um=%d", cpu, (void *) tf, + TRAPF_USERMODE(tf)); retval = 0; @@ -696,8 +700,8 @@ amd_intr(int cpu, uintptr_t eip, int usermode) * * If multiple PMCs interrupt at the same time, the AMD64 * processor appears to deliver as many NMIs as there are - * outstanding PMC interrupts. Thus we need to only process - * one interrupt at a time. + * outstanding PMC interrupts. So we process only one NMI + * interrupt at a time. */ for (i = 0; retval == 0 && i < AMD_NPMCS-1; i++) { @@ -717,9 +721,9 @@ amd_intr(int cpu, uintptr_t eip, int usermode) continue; } - retval = 1; /* found an interrupting PMC */ + retval = 1; /* Found an interrupting PMC. */ - /* stop the PMC, reload count */ + /* Stop the PMC, reload count. */ evsel = AMD_PMC_EVSEL_0 + i; perfctr = AMD_PMC_PERFCTR_0 + i; v = pm->pm_sc.pm_reloadcount; @@ -733,8 +737,8 @@ amd_intr(int cpu, uintptr_t eip, int usermode) wrmsr(evsel, config & ~AMD_PMC_ENABLE); wrmsr(perfctr, AMD_RELOAD_COUNT_TO_PERFCTR_VALUE(v)); - /* restart the counter if there was no error during logging */ - error = pmc_process_interrupt(cpu, pm, eip, usermode); + /* Restart the counter if logging succeeded. 
*/ + error = pmc_process_interrupt(cpu, pm, tf, TRAPF_USERMODE(tf)); if (error == 0) wrmsr(evsel, config | AMD_PMC_ENABLE); } @@ -742,7 +746,7 @@ amd_intr(int cpu, uintptr_t eip, int usermode) atomic_add_int(retval ? &pmc_stats.pm_intr_processed : &pmc_stats.pm_intr_ignored, 1); - return retval; + return (retval); } /* diff --git a/sys/dev/hwpmc/hwpmc_logging.c b/sys/dev/hwpmc/hwpmc_logging.c index 7be4776..77417af 100644 --- a/sys/dev/hwpmc/hwpmc_logging.c +++ b/sys/dev/hwpmc/hwpmc_logging.c @@ -1,7 +1,11 @@ /*- - * Copyright (c) 2005 Joseph Koshy + * Copyright (c) 2005-2007 Joseph Koshy + * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -64,7 +68,6 @@ TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "logbuffersize", &pmclog_buffer_size); SYSCTL_INT(_kern_hwpmc, OID_AUTO, logbuffersize, CTLFLAG_TUN|CTLFLAG_RD, &pmclog_buffer_size, 0, "size of log buffers in kilobytes"); - /* * kern.hwpmc.nbuffer -- number of global log buffers */ @@ -96,7 +99,6 @@ static struct mtx pmc_kthread_mtx; /* sleep lock */ /* * Log file record constructors. */ - #define _PMCLOG_TO_HEADER(T,L) \ ((PMCLOG_HEADER_MAGIC << 24) | \ (PMCLOG_TYPE_ ## T << 16) | \ @@ -135,6 +137,8 @@ static struct mtx pmc_kthread_mtx; /* sleep lock */ * Assertions about the log file format. */ +CTASSERT(sizeof(struct pmclog_callchain) == 6*4 + + PMC_CALLCHAIN_DEPTH_MAX*sizeof(uintfptr_t)); CTASSERT(sizeof(struct pmclog_closelog) == 3*4); CTASSERT(sizeof(struct pmclog_dropnotify) == 3*4); CTASSERT(sizeof(struct pmclog_map_in) == PATH_MAX + @@ -710,9 +714,28 @@ pmclog_flush(struct pmc_owner *po) } -/* - * Send a 'close log' event to the log file. 
- */ +void +pmclog_process_callchain(struct pmc *pm, struct pmc_sample *ps) +{ + int n, recordlen; + uint32_t flags; + struct pmc_owner *po; + + PMCDBG(LOG,SAM,1,"pm=%p pid=%d n=%d", pm, ps->ps_pid, + ps->ps_nsamples); + + recordlen = offsetof(struct pmclog_callchain, pl_pc) + + ps->ps_nsamples * sizeof(uintfptr_t); + po = pm->pm_owner; + flags = PMC_CALLCHAIN_TO_CPUFLAGS(ps->ps_cpu,ps->ps_flags); + PMCLOG_RESERVE(po, CALLCHAIN, recordlen); + PMCLOG_EMIT32(ps->ps_pid); + PMCLOG_EMIT32(pm->pm_id); + PMCLOG_EMIT32(flags); + for (n = 0; n < ps->ps_nsamples; n++) + PMCLOG_EMITADDR(ps->ps_pc[n]); + PMCLOG_DESPATCH(po); +} void pmclog_process_closelog(struct pmc_owner *po) @@ -761,24 +784,6 @@ pmclog_process_map_out(struct pmc_owner *po, pid_t pid, uintfptr_t start, } void -pmclog_process_pcsample(struct pmc *pm, struct pmc_sample *ps) -{ - struct pmc_owner *po; - - PMCDBG(LOG,SAM,1,"pm=%p pid=%d pc=%p", pm, ps->ps_pid, - (void *) ps->ps_pc); - - po = pm->pm_owner; - - PMCLOG_RESERVE(po, PCSAMPLE, sizeof(struct pmclog_pcsample)); - PMCLOG_EMIT32(ps->ps_pid); - PMCLOG_EMITADDR(ps->ps_pc); - PMCLOG_EMIT32(pm->pm_id); - PMCLOG_EMIT32(ps->ps_usermode); - PMCLOG_DESPATCH(po); -} - -void pmclog_process_pmcallocate(struct pmc *pm) { struct pmc_owner *po; diff --git a/sys/dev/hwpmc/hwpmc_mod.c b/sys/dev/hwpmc/hwpmc_mod.c index 6c0e0ea..0bba092 100644 --- a/sys/dev/hwpmc/hwpmc_mod.c +++ b/sys/dev/hwpmc/hwpmc_mod.c @@ -1,7 +1,11 @@ /*- - * Copyright (c) 2003-2006 Joseph Koshy + * Copyright (c) 2003-2007 Joseph Koshy + * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -166,6 +170,7 @@ static int pmc_attach_one_process(struct proc *p, struct pmc *pm); static int pmc_can_allocate_rowindex(struct proc *p, unsigned int ri, int cpu); static int pmc_can_attach(struct pmc *pm, struct proc *p); +static void pmc_capture_user_callchain(int cpu, struct trapframe *tf); static void pmc_cleanup(void); static int pmc_detach_process(struct proc *p, struct pmc *pm); static int pmc_detach_one_process(struct proc *p, struct pmc *pm, @@ -180,6 +185,9 @@ static struct pmc_process *pmc_find_process_descriptor(struct proc *p, static void pmc_force_context_switch(void); static void pmc_link_target_process(struct pmc *pm, struct pmc_process *pp); +static void pmc_log_all_process_mappings(struct pmc_owner *po); +static void pmc_log_kernel_mappings(struct pmc *pm); +static void pmc_log_process_mappings(struct pmc_owner *po, struct proc *p); static void pmc_maybe_remove_owner(struct pmc_owner *po); static void pmc_process_csw_in(struct thread *td); static void pmc_process_csw_out(struct thread *td); @@ -205,6 +213,11 @@ static void pmc_unlink_target_process(struct pmc *pmc, SYSCTL_NODE(_kern, OID_AUTO, hwpmc, CTLFLAG_RW, 0, "HWPMC parameters"); +static int pmc_callchaindepth = PMC_CALLCHAIN_DEPTH; +TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "callchaindepth", &pmc_callchaindepth); +SYSCTL_INT(_kern_hwpmc, OID_AUTO, callchaindepth, CTLFLAG_TUN|CTLFLAG_RD, + &pmc_callchaindepth, 0, "depth of call chain records"); + #ifdef DEBUG struct pmc_debugflags pmc_debugflags = PMC_DEBUG_DEFAULT_FLAGS; char pmc_debugstr[PMC_DEBUG_STRSIZE]; @@ -226,7 +239,7 @@ SYSCTL_INT(_kern_hwpmc, OID_AUTO, hashsize, CTLFLAG_TUN|CTLFLAG_RD, &pmc_hashsize, 0, "rows in hash tables"); /* - * kern.hwpmc.nsamples --- number of PC samples per CPU + * kern.hwpmc.nsamples --- number of PC samples/callchain stacks per CPU */ static int pmc_nsamples 
= PMC_NSAMPLES; @@ -234,6 +247,7 @@ TUNABLE_INT(PMC_SYSCTL_NAME_PREFIX "nsamples", &pmc_nsamples); SYSCTL_INT(_kern_hwpmc, OID_AUTO, nsamples, CTLFLAG_TUN|CTLFLAG_RD, &pmc_nsamples, 0, "number of PC samples per CPU"); + /* * kern.hwpmc.mtxpoolsize -- number of mutexes in the mutex pool. */ @@ -957,6 +971,8 @@ pmc_attach_one_process(struct proc *p, struct pmc *pm) pmclog_process_pmcattach(pm, p->p_pid, fullpath); if (freepath) FREE(freepath, M_TEMP); + if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) + pmc_log_process_mappings(pm->pm_owner, p); } /* mark process as using HWPMCs */ PROC_LOCK(p); @@ -1449,7 +1465,7 @@ pmc_process_kld_unload(struct pmckern_map_out *pkm) if (po->po_flags & PMC_PO_OWNS_LOGFILE) pmclog_process_map_out(po, (pid_t) -1, pkm->pm_address, pkm->pm_address + pkm->pm_size); - + /* * TODO: Notify owners of process-sampling PMCs. */ @@ -1528,6 +1544,88 @@ pmc_process_munmap(struct thread *td, struct pmckern_map_out *pkm) } /* + * Log mapping information about the kernel. + */ + +static void +pmc_log_kernel_mappings(struct pmc *pm) +{ + struct pmc_owner *po; + struct pmckern_map_in *km, *kmbase; + + sx_assert(&pmc_sx, SX_LOCKED); + KASSERT(PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm)), + ("[pmc,%d] non-sampling PMC (%p) desires mapping information", + __LINE__, (void *) pm)); + + po = pm->pm_owner; + + if (po->po_flags & PMC_PO_INITIAL_MAPPINGS_DONE) + return; + + /* + * Log the current set of kernel modules. + */ + kmbase = linker_hwpmc_list_objects(); + for (km = kmbase; km->pm_file != NULL; km++) { + PMCDBG(LOG,REG,1,"%s %p", (char *) km->pm_file, + (void *) km->pm_address); + pmclog_process_map_in(po, (pid_t) -1, km->pm_address, + km->pm_file); + } + FREE(kmbase, M_LINKER); + + po->po_flags |= PMC_PO_INITIAL_MAPPINGS_DONE; +} + +/* + * Log the mappings for a single process. + */ + +static void +pmc_log_process_mappings(struct pmc_owner *po, struct proc *p) +{ +} + +/* + * Log mappings for all processes in the system. 
+ */ + +static void +pmc_log_all_process_mappings(struct pmc_owner *po) +{ + struct proc *p, *top; + + sx_assert(&pmc_sx, SX_XLOCKED); + + if ((p = pfind(1)) == NULL) + panic("[pmc,%d] Cannot find init", __LINE__); + + PROC_UNLOCK(p); + + sx_slock(&proctree_lock); + + top = p; + + for (;;) { + pmc_log_process_mappings(po, p); + if (!LIST_EMPTY(&p->p_children)) + p = LIST_FIRST(&p->p_children); + else for (;;) { + if (p == top) + goto done; + if (LIST_NEXT(p, p_sibling)) { + p = LIST_NEXT(p, p_sibling); + break; + } + p = p->p_pptr; + } + } + done: + sx_sunlock(&proctree_lock); +} + +/* * The 'hook' invoked from the kernel proper */ @@ -1543,7 +1641,8 @@ const char *pmc_hooknames[] = { "KLDLOAD", "KLDUNLOAD", "MMAP", - "MUNMAP" + "MUNMAP", + "CALLCHAIN" }; #endif @@ -1726,6 +1825,14 @@ pmc_hook_handler(struct thread *td, int function, void *arg) pmc_process_munmap(td, (struct pmckern_map_out *) arg); break; + case PMC_FN_USER_CALLCHAIN: + /* + * Record a call chain. + */ + pmc_capture_user_callchain(PCPU_GET(cpuid), + (struct trapframe *) arg); + break; + default: #ifdef DEBUG KASSERT(0, ("[pmc,%d] unknown hook %d\n", __LINE__, function)); @@ -2321,6 +2428,21 @@ pmc_start(struct pmc *pm) po = pm->pm_owner; + /* + * Disallow PMCSTART if a logfile is required but has not been + * configured yet. + */ + if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) && + (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) + return EDOOFUS; /* programming error */ + + /* + * If this is a sampling mode PMC, log mapping information for + * the kernel modules that are currently loaded. + */ + if (PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) + pmc_log_kernel_mappings(pm); + if (PMC_IS_VIRTUAL_MODE(mode)) { /* @@ -2333,15 +2455,6 @@ pmc_start(struct pmc *pm) pmc_attach_process(po->po_owner, pm); /* - * Disallow PMCSTART if a logfile is required but has not - * been configured yet. 
- */ - - if (error == 0 && (pm->pm_flags & PMC_F_NEEDS_LOGFILE) && - (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) - error = EDOOFUS; - - /* * If the PMC is attached to its owner, then force a context * switch to ensure that the MD state gets set correctly. */ @@ -2358,13 +2471,7 @@ pmc_start(struct pmc *pm) /* * A system-wide PMC. - */ - - if ((pm->pm_flags & PMC_F_NEEDS_LOGFILE) && - (po->po_flags & PMC_PO_OWNS_LOGFILE) == 0) - return EDOOFUS; /* programming error */ - - /* + * * Add the owner to the global list if this is a system-wide * sampling PMC. */ @@ -2378,7 +2485,8 @@ pmc_start(struct pmc *pm) po->po_sscount++; } - /* TODO: dump system wide process mappings to the log? */ + /* Log mapping information for all processes in the system. */ + pmc_log_all_process_mappings(po); /* * Move to the CPU associated with this @@ -2554,7 +2662,6 @@ pmc_syscall_handler(struct thread *td, void *syscall_args) struct proc *p; struct pmc *pm; struct pmc_owner *po; - struct pmckern_map_in *km, *kmbase; struct pmc_op_configurelog cl; sx_assert(&pmc_sx, SX_XLOCKED); @@ -2593,18 +2700,6 @@ pmc_syscall_handler(struct thread *td, void *syscall_args) if (error) break; - - /* - * Log the current set of kernel modules. - */ - kmbase = linker_hwpmc_list_objects(); - for (km = kmbase; km->pm_file != NULL; km++) { - PMCDBG(LOG,REG,1,"%s %p", (char *) km->pm_file, - (void *) km->pm_address); - pmclog_process_map_in(po, (pid_t) -1, km->pm_address, - km->pm_file); - } - FREE(kmbase, M_LINKER); } break; @@ -2945,7 +3040,7 @@ pmc_syscall_handler(struct thread *td, void *syscall_args) */ if ((pa.pm_flags & ~(PMC_F_DESCENDANTS | PMC_F_LOG_PROCCSW | - PMC_F_LOG_PROCEXIT)) != 0) { + PMC_F_LOG_PROCEXIT | PMC_F_CALLCHAIN)) != 0) { error = EINVAL; break; } @@ -3633,56 +3728,117 @@ pmc_syscall_handler(struct thread *td, void *syscall_args) /* + * Mark the thread as needing callchain capture and post an AST. 
The + * actual callchain capture will be done in a context where it is safe + * to take page faults. + */ + +static void +pmc_post_callchain_ast(void) +{ + struct thread *td; + + td = curthread; + + /* + * Mark this thread as needing processing in ast(). + * td->td_pflags will be safe to touch as the process was in + * user space when it was interrupted. + */ + td->td_pflags |= TDP_CALLCHAIN; + + /* + * Again, since we've entered this function directly from + * userland, `td' is guaranteed to be not locked by this CPU, + * so its safe to try acquire the thread lock even though we + * are executing in an NMI context. We need to acquire this + * lock before touching `td_flags' because other CPUs may be + * in the process of touching this field. + */ + thread_lock(td); + td->td_flags |= TDF_ASTPENDING; + thread_unlock(td); + + return; +} + +/* * Interrupt processing. * - * Find a free slot in the per-cpu array of PC samples and write the - * current (PMC,PID,PC) triple to it. If an event was successfully - * added, a bit is set in mask 'pmc_cpumask' denoting that the - * DO_SAMPLES hook needs to be invoked from the clock handler. + * Find a free slot in the per-cpu array of samples and capture the + * current callchain there. If a sample was successfully added, a bit + * is set in mask 'pmc_cpumask' denoting that the DO_SAMPLES hook + * needs to be invoked from the clock handler. * * This function is meant to be called from an NMI handler. It cannot * use any of the locking primitives supplied by the OS. */ int -pmc_process_interrupt(int cpu, struct pmc *pm, uintfptr_t pc, int usermode) +pmc_process_interrupt(int cpu, struct pmc *pm, struct trapframe *tf, + int inuserspace) { - int error, ri; + int error, callchaindepth; struct thread *td; struct pmc_sample *ps; struct pmc_samplebuffer *psb; error = 0; - ri = PMC_TO_ROWINDEX(pm); + /* + * Allocate space for a sample buffer. 
+ */ psb = pmc_pcpu[cpu]->pc_sb; ps = psb->ps_write; - if (ps->ps_pc) { /* in use, reader hasn't caught up */ + if (ps->ps_nsamples) { /* in use, reader hasn't caught up */ pm->pm_stalled = 1; atomic_add_int(&pmc_stats.pm_intr_bufferfull, 1); - PMCDBG(SAM,INT,1,"(spc) cpu=%d pm=%p pc=%jx um=%d wr=%d rd=%d", - cpu, pm, (uint64_t) pc, usermode, + PMCDBG(SAM,INT,1,"(spc) cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", + cpu, pm, (void *) tf, inuserspace, (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); error = ENOMEM; goto done; } - /* fill in entry */ - PMCDBG(SAM,INT,1,"cpu=%d pm=%p pc=%jx um=%d wr=%d rd=%d", cpu, pm, - (uint64_t) pc, usermode, + + /* Fill in entry. */ + PMCDBG(SAM,INT,1,"cpu=%d pm=%p tf=%p um=%d wr=%d rd=%d", cpu, pm, + (void *) tf, inuserspace, (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); - atomic_add_rel_32(&pm->pm_runcount, 1); /* hold onto PMC */ + atomic_add_rel_32(&pm->pm_runcount, 1); /* hold onto PMC */ ps->ps_pmc = pm; if ((td = curthread) && td->td_proc) ps->ps_pid = td->td_proc->p_pid; else ps->ps_pid = -1; - ps->ps_usermode = usermode; - ps->ps_pc = pc; /* mark entry as in use */ + ps->ps_cpu = cpu; + ps->ps_flags = inuserspace ? PMC_CC_F_USERSPACE : 0; + + callchaindepth = (pm->pm_flags & PMC_F_CALLCHAIN) ? + pmc_callchaindepth : 1; + + if (callchaindepth == 1) + ps->ps_pc[0] = PMC_TRAPFRAME_TO_PC(tf); + else { + /* + * Kernel stack traversals can be done immediately, + * while we defer to an AST for user space traversals. 
+ */ + if (!inuserspace) + callchaindepth = + pmc_save_kernel_callchain(ps->ps_pc, + callchaindepth, tf); + else { + pmc_post_callchain_ast(); + callchaindepth = PMC_SAMPLE_INUSE; + } + } + + ps->ps_nsamples = callchaindepth; /* mark entry as in use */ /* increment write pointer, modulo ring buffer size */ ps++; @@ -3695,7 +3851,50 @@ pmc_process_interrupt(int cpu, struct pmc *pm, uintfptr_t pc, int usermode) /* mark CPU as needing processing */ atomic_set_rel_int(&pmc_cpumask, (1 << cpu)); - return error; + return (error); +} + +/* + * Capture a user call chain. This function will be called from ast() + * before control returns to userland and before the process gets + * rescheduled. + */ + +static void +pmc_capture_user_callchain(int cpu, struct trapframe *tf) +{ + int i; + struct pmc *pm; + struct pmc_sample *ps; + struct pmc_samplebuffer *psb; + + psb = pmc_pcpu[cpu]->pc_sb; + + /* + * Iterate through all deferred callchain requests. + */ + + for (i = 0; i < pmc_nsamples; i++) { + + ps = &psb->ps_samples[i]; + if (ps->ps_nsamples != PMC_SAMPLE_INUSE) + continue; + + pm = ps->ps_pmc; + + KASSERT(pm->pm_flags & PMC_F_CALLCHAIN, + ("[pmc,%d] Retrieving callchain for PMC that doesn't " + "want it", __LINE__)); + + /* + * Retrieve the callchain and mark the sample buffer + * as 'processable' by the timer tick sweep code. + */ + ps->ps_nsamples = pmc_save_user_callchain(ps->ps_pc, + pmc_callchaindepth, tf); + } + + return; } @@ -3722,8 +3921,13 @@ pmc_process_samples(int cpu) for (n = 0; n < pmc_nsamples; n++) { /* bound on #iterations */ ps = psb->ps_read; - if (ps->ps_pc == (uintfptr_t) 0) /* no data */ + if (ps->ps_nsamples == PMC_SAMPLE_FREE) + break; + if (ps->ps_nsamples == PMC_SAMPLE_INUSE) { + /* Need a rescan at a later time. 
*/ + atomic_set_rel_int(&pmc_cpumask, (1 << cpu)); break; + } pm = ps->ps_pmc; po = pm->pm_owner; @@ -3736,8 +3940,8 @@ pmc_process_samples(int cpu) if (pm->pm_state != PMC_STATE_RUNNING) goto entrydone; - PMCDBG(SAM,OPS,1,"cpu=%d pm=%p pc=%jx um=%d wr=%d rd=%d", cpu, - pm, (uint64_t) ps->ps_pc, ps->ps_usermode, + PMCDBG(SAM,OPS,1,"cpu=%d pm=%p n=%d fl=%x wr=%d rd=%d", cpu, + pm, ps->ps_nsamples, ps->ps_flags, (int) (psb->ps_write - psb->ps_samples), (int) (psb->ps_read - psb->ps_samples)); @@ -3748,9 +3952,9 @@ pmc_process_samples(int cpu) * would have done. */ if (pm->pm_flags & PMC_F_ATTACHED_TO_OWNER) { - if (ps->ps_usermode) { + if (ps->ps_flags & PMC_CC_F_USERSPACE) { td = FIRST_THREAD_IN_PROC(po->po_owner); - addupc_intr(td, ps->ps_pc, 1); + addupc_intr(td, ps->ps_pc[0], 1); } goto entrydone; } @@ -3762,10 +3966,10 @@ pmc_process_samples(int cpu) * entry to the PMC's owner process. */ - pmclog_process_pcsample(pm, ps); + pmclog_process_callchain(pm, ps); entrydone: - ps->ps_pc = (uintfptr_t) 0; /* mark entry as free */ + ps->ps_nsamples = 0; /* mark entry as free */ atomic_subtract_rel_32(&pm->pm_runcount, 1); /* increment read pointer, modulo sample size */ @@ -4087,6 +4291,7 @@ pmc_initialize(void) { int cpu, error, n; struct pmc_binding pb; + struct pmc_sample *ps; struct pmc_samplebuffer *sb; md = NULL; @@ -4119,17 +4324,24 @@ pmc_initialize(void) */ if (pmc_hashsize <= 0) { - (void) printf("hwpmc: tunable hashsize=%d must be greater " - "than zero.\n", pmc_hashsize); + (void) printf("hwpmc: tunable \"hashsize\"=%d must be " + "greater than zero.\n", pmc_hashsize); pmc_hashsize = PMC_HASH_SIZE; } if (pmc_nsamples <= 0 || pmc_nsamples > 65535) { - (void) printf("hwpmc: tunable nsamples=%d out of range.\n", - pmc_nsamples); + (void) printf("hwpmc: tunable \"nsamples\"=%d out of " + "range.\n", pmc_nsamples); pmc_nsamples = PMC_NSAMPLES; } + if (pmc_callchaindepth <= 0 || + pmc_callchaindepth > PMC_CALLCHAIN_DEPTH_MAX) { + (void) printf("hwpmc: tunable 
\"callchaindepth\"=%d out of " + "range.\n", pmc_callchaindepth); + pmc_callchaindepth = PMC_CALLCHAIN_DEPTH; + } + md = pmc_md_initialize(); if (md == NULL || md->pmd_init == NULL) @@ -4171,6 +4383,14 @@ pmc_initialize(void) KASSERT(pmc_pcpu[cpu] != NULL, ("[pmc,%d] cpu=%d Null per-cpu data", __LINE__, cpu)); + MALLOC(sb->ps_callchains, uintptr_t *, + pmc_callchaindepth * pmc_nsamples * sizeof(uintptr_t), + M_PMC, M_WAITOK|M_ZERO); + + for (n = 0, ps = sb->ps_samples; n < pmc_nsamples; n++, ps++) + ps->ps_pc = sb->ps_callchains + + (n * pmc_callchaindepth); + pmc_pcpu[cpu]->pc_sb = sb; } @@ -4327,6 +4547,7 @@ pmc_cleanup(void) KASSERT(pmc_pcpu[cpu]->pc_sb != NULL, ("[pmc,%d] Null cpu sample buffer cpu=%d", __LINE__, cpu)); + FREE(pmc_pcpu[cpu]->pc_sb->ps_callchains, M_PMC); FREE(pmc_pcpu[cpu]->pc_sb, M_PMC); pmc_pcpu[cpu]->pc_sb = NULL; } diff --git a/sys/dev/hwpmc/hwpmc_piv.c b/sys/dev/hwpmc/hwpmc_piv.c index a6cd3fe..7994330 100644 --- a/sys/dev/hwpmc/hwpmc_piv.c +++ b/sys/dev/hwpmc/hwpmc_piv.c @@ -1,7 +1,11 @@ /*- - * Copyright (c) 2003-2005 Joseph Koshy + * Copyright (c) 2003-2007 Joseph Koshy + * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -35,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -1478,7 +1483,7 @@ p4_stop_pmc(int cpu, int ri) * * On HTT machines, this PMC may be in use by two threads * running on two logical CPUS. Thus we look at the - * 'pm_runcount' field and only turn off the appropriate TO/T1 + * 'runcount' field and only turn off the appropriate TO/T1 * bits (and keep the PMC running) if two logical CPUs were * using the PMC. 
* @@ -1562,16 +1567,17 @@ p4_stop_pmc(int cpu, int ri) */ static int -p4_intr(int cpu, uintptr_t eip, int usermode) +p4_intr(int cpu, struct trapframe *tf) { - int i, did_interrupt, error, ri; uint32_t cccrval, ovf_mask, ovf_partner; - struct p4_cpu *pc; + int i, did_interrupt, error, ri; struct pmc_hw *phw; + struct p4_cpu *pc; struct pmc *pm; pmc_value_t v; - PMCDBG(MDP,INT, 1, "cpu=%d eip=%p um=%d", cpu, (void *) eip, usermode); + PMCDBG(MDP,INT, 1, "cpu=%d tf=0x%p um=%d", cpu, (void *) tf, + TRAPF_USERMODE(tf)); pc = (struct p4_cpu *) pmc_pcpu[P4_TO_HTT_PRIMARY(cpu)]; @@ -1579,8 +1585,8 @@ p4_intr(int cpu, uintptr_t eip, int usermode) P4_CCCR_OVF_PMI_T1 : P4_CCCR_OVF_PMI_T0; ovf_mask |= P4_CCCR_OVF; if (p4_system_has_htt) - ovf_partner = P4_CPU_IS_HTT_SECONDARY(cpu) ? P4_CCCR_OVF_PMI_T0 : - P4_CCCR_OVF_PMI_T1; + ovf_partner = P4_CPU_IS_HTT_SECONDARY(cpu) ? + P4_CCCR_OVF_PMI_T0 : P4_CCCR_OVF_PMI_T1; else ovf_partner = 0; did_interrupt = 0; @@ -1617,7 +1623,8 @@ p4_intr(int cpu, uintptr_t eip, int usermode) !PMC_IS_SAMPLING_MODE(PMC_TO_MODE(pm))) { continue; } - (void) pmc_process_interrupt(cpu, pm, eip, usermode); + (void) pmc_process_interrupt(cpu, pm, tf, + TRAPF_USERMODE(tf)); continue; } @@ -1667,7 +1674,8 @@ p4_intr(int cpu, uintptr_t eip, int usermode) * Process the interrupt. Re-enable the PMC if * processing was successful. */ - error = pmc_process_interrupt(cpu, pm, eip, usermode); + error = pmc_process_interrupt(cpu, pm, tf, + TRAPF_USERMODE(tf)); /* * Only the first processor executing the NMI handler @@ -1698,7 +1706,7 @@ p4_intr(int cpu, uintptr_t eip, int usermode) atomic_add_int(did_interrupt ? 
&pmc_stats.pm_intr_processed : &pmc_stats.pm_intr_ignored, 1); - return did_interrupt; + return (did_interrupt); } /* diff --git a/sys/dev/hwpmc/hwpmc_ppro.c b/sys/dev/hwpmc/hwpmc_ppro.c index 993a2bf..979c04e 100644 --- a/sys/dev/hwpmc/hwpmc_ppro.c +++ b/sys/dev/hwpmc/hwpmc_ppro.c @@ -1,7 +1,11 @@ /*- * Copyright (c) 2003-2005 Joseph Koshy + * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -35,6 +39,7 @@ __FBSDID("$FreeBSD$"); #include #include +#include #include #include #include @@ -710,7 +715,7 @@ p6_stop_pmc(int cpu, int ri) } static int -p6_intr(int cpu, uintptr_t eip, int usermode) +p6_intr(int cpu, struct trapframe *tf) { int i, error, retval, ri; uint32_t perf0cfg; @@ -745,7 +750,8 @@ p6_intr(int cpu, uintptr_t eip, int usermode) retval = 1; - error = pmc_process_interrupt(cpu, pm, eip, usermode); + error = pmc_process_interrupt(cpu, pm, tf, + TRAPF_USERMODE(tf)); if (error) P6_MARK_STOPPED(pc,ri); diff --git a/sys/dev/hwpmc/hwpmc_x86.c b/sys/dev/hwpmc/hwpmc_x86.c index cb6db23..2fc7cd9 100644 --- a/sys/dev/hwpmc/hwpmc_x86.c +++ b/sys/dev/hwpmc/hwpmc_x86.c @@ -1,7 +1,11 @@ /*- * Copyright (c) 2005, Joseph Koshy + * Copyright (c) 2007 The FreeBSD Foundation * All rights reserved. * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. 
+ * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -30,12 +34,18 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include +#include #include #include #include +#include +#include +#include + extern volatile lapic_t *lapic; void @@ -48,6 +58,187 @@ pmc_x86_lapic_enable_pmc_interrupt(void) lapic->lvt_pcint = value; } +/* + * Attempt to walk a user call stack using a too-simple algorithm. + * In the general case we need unwind information associated with + * the executable to be able to walk the user stack. + * + * We are handed a trap frame laid down at the time the PMC interrupt + * was taken. If the application is using frame pointers, the saved + * PC value could be: + * a. at the beginning of a function before the stack frame is laid + * down, + * b. just before a 'ret', after the stack frame has been taken off, + * c. somewhere else in the function with a valid stack frame being + * present, + * + * If the application is not using frame pointers, this algorithm will + * fail to yield an interesting call chain. + * + * TODO: figure out a way to use unwind information. 
+ */ + +int +pmc_save_user_callchain(uintptr_t *cc, int nframes, struct trapframe *tf) +{ + int n; + uint32_t instr; + uintptr_t fp, oldfp, pc, r, sp; + + KASSERT(TRAPF_USERMODE(tf), ("[x86,%d] Not a user trap frame tf=%p", + __LINE__, (void *) tf)); + + pc = PMC_TRAPFRAME_TO_PC(tf); + oldfp = fp = PMC_TRAPFRAME_TO_FP(tf); + sp = PMC_TRAPFRAME_TO_SP(tf); + + *cc++ = pc; n = 1; + + r = fp + sizeof(uintptr_t); /* points to return address */ + + if (!PMC_IN_USERSPACE(pc)) + return (n); + + if (copyin((void *) pc, &instr, sizeof(instr)) != 0) + return (n); + + if (PMC_AT_FUNCTION_PROLOGUE_PUSH_BP(instr) || + PMC_AT_FUNCTION_EPILOGUE_RET(instr)) { /* ret */ + if (copyin((void *) sp, &pc, sizeof(pc)) != 0) + return (n); + } else if (PMC_AT_FUNCTION_PROLOGUE_MOV_SP_BP(instr)) { + sp += sizeof(uintptr_t); + if (copyin((void *) sp, &pc, sizeof(pc)) != 0) + return (n); + } else if (copyin((void *) r, &pc, sizeof(pc)) != 0 || + copyin((void *) fp, &fp, sizeof(fp)) != 0) + return (n); + + for (; n < nframes;) { + if (pc == 0 || !PMC_IN_USERSPACE(pc)) + break; + + *cc++ = pc; n++; + + if (fp < oldfp) + break; + + r = fp + sizeof(uintptr_t); /* address of return address */ + oldfp = fp; + + if (copyin((void *) r, &pc, sizeof(pc)) != 0 || + copyin((void *) fp, &fp, sizeof(fp)) != 0) + break; + } + + return (n); +} + +/* + * Walking the kernel call stack. + * + * We are handed the trap frame laid down at the time the PMC + * interrupt was taken. The saved PC could be: + * a. in the lowlevel trap handler, meaning that there isn't a C stack + * to traverse, + * b. at the beginning of a function before the stack frame is laid + * down, + * c. just before a 'ret', after the stack frame has been taken off, + * d. somewhere else in a function with a valid stack frame being + * present. + * + * In case (d), the previous frame pointer is at [%ebp]/[%rbp] and + * the return address is at [%ebp+4]/[%rbp+8]. 
+ * + * For cases (b) and (c), the return address is at [%esp]/[%rsp] and + * the frame pointer doesn't need to be changed when going up one + * level in the stack. + * + * For case (a), we check if the PC lies in low-level trap handling + * code, and if so we terminate our trace. + */ + +int +pmc_save_kernel_callchain(uintptr_t *cc, int nframes, struct trapframe *tf) +{ + int n; + uint32_t instr; + uintptr_t fp, pc, r, sp, stackstart, stackend; + struct thread *td; + + KASSERT(TRAPF_USERMODE(tf) == 0,("[x86,%d] not a kernel backtrace", + __LINE__)); + + pc = PMC_TRAPFRAME_TO_PC(tf); + fp = PMC_TRAPFRAME_TO_FP(tf); + sp = PMC_TRAPFRAME_TO_SP(tf); + + *cc++ = pc; + r = fp + sizeof(uintptr_t); /* points to return address */ + + if ((td = curthread) == NULL) + return (1); + + if (nframes <= 1) + return (1); + + stackstart = (uintptr_t) td->td_kstack; + stackend = (uintptr_t) td->td_kstack + td->td_kstack_pages * PAGE_SIZE; + + if (PMC_IN_TRAP_HANDLER(pc) || + !PMC_IN_KERNEL(pc) || !PMC_IN_KERNEL(r) || + !PMC_IN_KERNEL_STACK(sp, stackstart, stackend) || + !PMC_IN_KERNEL_STACK(fp, stackstart, stackend)) + return (1); + + instr = *(uint32_t *) pc; + + /* + * Determine whether the interrupted function was in the + * processing of either laying down its stack frame or taking + * it off. + * + * If we haven't started laying down a stack frame, or are + * just about to return, then our caller's address is at + * *sp, and we don't have a frame to unwind. + */ + if (PMC_AT_FUNCTION_PROLOGUE_PUSH_BP(instr) || + PMC_AT_FUNCTION_EPILOGUE_RET(instr)) + pc = *(uintptr_t *) sp; + else if (PMC_AT_FUNCTION_PROLOGUE_MOV_SP_BP(instr)) { + /* + * The code was midway through laying down a frame. + * At this point sp[0] has a frame back pointer, + * and the caller's address is therefore at sp[1]. + */ + sp += sizeof(uintptr_t); + if (!PMC_IN_KERNEL_STACK(sp, stackstart, stackend)) + return (1); + pc = *(uintptr_t *) sp; + } else { + /* + * Not in the function prologue or epilogue. 
+ */ + pc = *(uintptr_t *) r; + fp = *(uintptr_t *) fp; + } + + for (n = 1; n < nframes; n++) { + *cc++ = pc; + + if (PMC_IN_TRAP_HANDLER(pc)) + break; + + r = fp + sizeof(uintptr_t); + if (!PMC_IN_KERNEL_STACK(fp, stackstart, stackend) || + !PMC_IN_KERNEL(r)) + break; + pc = *(uintptr_t *) r; + fp = *(uintptr_t *) fp; + } + + return (n); +} static struct pmc_mdep * pmc_intel_initialize(void) -- cgit v1.1