diff options
Diffstat (limited to 'lib/libpmc')
38 files changed, 19089 insertions, 0 deletions
diff --git a/lib/libpmc/Makefile b/lib/libpmc/Makefile new file mode 100644 index 0000000..85ddf0f --- /dev/null +++ b/lib/libpmc/Makefile @@ -0,0 +1,72 @@ +# $FreeBSD$ + +LIB= pmc + +SRCS= libpmc.c pmclog.c +INCS= pmc.h pmclog.h + +MAN= pmc.3 +MAN+= pmc_allocate.3 +MAN+= pmc_attach.3 +MAN+= pmc_capabilities.3 +MAN+= pmc_configure_logfile.3 +MAN+= pmc_disable.3 +MAN+= pmc_event_names_of_class.3 +MAN+= pmc_get_driver_stats.3 +MAN+= pmc_get_msr.3 +MAN+= pmc_init.3 +MAN+= pmc_name_of_capability.3 +MAN+= pmc_read.3 +MAN+= pmc_set.3 +MAN+= pmc_start.3 +MAN+= pmclog.3 + +# PMC-dependent manual pages +.if ${MACHINE_CPUARCH} == "i386" || ${MACHINE_CPUARCH} == "amd64" +MAN+= pmc.atom.3 +MAN+= pmc.core.3 +MAN+= pmc.core2.3 +MAN+= pmc.iaf.3 +MAN+= pmc.ucf.3 +MAN+= pmc.k7.3 +MAN+= pmc.k8.3 +MAN+= pmc.p4.3 +MAN+= pmc.p5.3 +MAN+= pmc.p6.3 +MAN+= pmc.corei7.3 +MAN+= pmc.corei7uc.3 +MAN+= pmc.westmere.3 +MAN+= pmc.westmereuc.3 +MAN+= pmc.tsc.3 +.elif ${MACHINE_CPUARCH} == "arm" && ${CPUTYPE} == "xscale" +MAN+= pmc.xscale.3 +.endif + +MLINKS+= \ + pmc_allocate.3 pmc_release.3 \ + pmc_attach.3 pmc_detach.3 \ + pmc_capabilities.3 pmc_ncpu.3 \ + pmc_capabilities.3 pmc_npmc.3 \ + pmc_capabilities.3 pmc_pmcinfo.3 \ + pmc_capabilities.3 pmc_cpuinfo.3 \ + pmc_capabilities.3 pmc_width.3 \ + pmc_configure_logfile.3 pmc_flush_logfile.3 \ + pmc_configure_logfile.3 pmc_writelog.3 \ + pmc_disable.3 pmc_enable.3 \ + pmc_name_of_capability.3 pmc_name_of_class.3 \ + pmc_name_of_capability.3 pmc_name_of_cputype.3 \ + pmc_name_of_capability.3 pmc_name_of_disposition.3 \ + pmc_name_of_capability.3 pmc_name_of_event.3 \ + pmc_name_of_capability.3 pmc_name_of_mode.3 \ + pmc_name_of_capability.3 pmc_name_of_state.3 \ + pmc_read.3 pmc_rw.3 \ + pmc_read.3 pmc_write.3 \ + pmc_start.3 pmc_stop.3 + +MLINKS+= \ + pmclog.3 pmclog_open.3 \ + pmclog.3 pmclog_close.3 \ + pmclog.3 pmclog_feed.3 \ + pmclog.3 pmclog_read.3 + +.include <bsd.lib.mk> diff --git a/lib/libpmc/libpmc.c b/lib/libpmc/libpmc.c new file mode 
100644 index 0000000..1d86a82 --- /dev/null +++ b/lib/libpmc/libpmc.c @@ -0,0 +1,3127 @@ +/*- + * Copyright (c) 2003-2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/types.h> +#include <sys/module.h> +#include <sys/pmc.h> +#include <sys/syscall.h> + +#include <ctype.h> +#include <errno.h> +#include <fcntl.h> +#include <pmc.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> + +#include "libpmcinternal.h" + +/* Function prototypes */ +#if defined(__i386__) +static int k7_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +#endif +#if defined(__amd64__) || defined(__i386__) +static int iaf_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +static int iap_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +static int ucf_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +static int ucp_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +static int k8_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +static int p4_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +#endif +#if defined(__i386__) +static int p5_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +static int p6_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +#endif +#if defined(__amd64__) || defined(__i386__) +static int tsc_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +#endif +#if defined(__XSCALE__) +static int xscale_allocate_pmc(enum pmc_event _pe, char *_ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +#endif + +#if defined(__mips__) +static int mips24k_allocate_pmc(enum pmc_event _pe, char* ctrspec, + struct pmc_op_pmcallocate *_pmc_config); +#endif /* __mips__ */ + + +#define PMC_CALL(cmd, params) \ + 
syscall(pmc_syscall, PMC_OP_##cmd, (params)) + +/* + * Event aliases provide a way for the user to ask for generic events + * like "cache-misses", or "instructions-retired". These aliases are + * mapped to the appropriate canonical event descriptions using a + * lookup table. + */ +struct pmc_event_alias { + const char *pm_alias; + const char *pm_spec; +}; + +static const struct pmc_event_alias *pmc_mdep_event_aliases; + +/* + * The pmc_event_descr structure maps symbolic names known to the user + * to integer codes used by the PMC KLD. + */ +struct pmc_event_descr { + const char *pm_ev_name; + enum pmc_event pm_ev_code; +}; + +/* + * The pmc_class_descr structure maps class name prefixes for + * event names to event tables and other PMC class data. + */ +struct pmc_class_descr { + const char *pm_evc_name; + size_t pm_evc_name_size; + enum pmc_class pm_evc_class; + const struct pmc_event_descr *pm_evc_event_table; + size_t pm_evc_event_table_size; + int (*pm_evc_allocate_pmc)(enum pmc_event _pe, + char *_ctrspec, struct pmc_op_pmcallocate *_pa); +}; + +#define PMC_TABLE_SIZE(N) (sizeof(N)/sizeof(N[0])) +#define PMC_EVENT_TABLE_SIZE(N) PMC_TABLE_SIZE(N##_event_table) + +#undef __PMC_EV +#define __PMC_EV(C,N) { #N, PMC_EV_ ## C ## _ ## N }, + +/* + * PMC_CLASSDEP_TABLE(NAME, CLASS) + * + * Define a table mapping event names and aliases to HWPMC event IDs. 
+ */ +#define PMC_CLASSDEP_TABLE(N, C) \ + static const struct pmc_event_descr N##_event_table[] = \ + { \ + __PMC_EV_##C() \ + } + +PMC_CLASSDEP_TABLE(iaf, IAF); +PMC_CLASSDEP_TABLE(k7, K7); +PMC_CLASSDEP_TABLE(k8, K8); +PMC_CLASSDEP_TABLE(p4, P4); +PMC_CLASSDEP_TABLE(p5, P5); +PMC_CLASSDEP_TABLE(p6, P6); +PMC_CLASSDEP_TABLE(xscale, XSCALE); +PMC_CLASSDEP_TABLE(mips24k, MIPS24K); +PMC_CLASSDEP_TABLE(ucf, UCF); + +#undef __PMC_EV_ALIAS +#define __PMC_EV_ALIAS(N,CODE) { N, PMC_EV_##CODE }, + +static const struct pmc_event_descr atom_event_table[] = +{ + __PMC_EV_ALIAS_ATOM() +}; + +static const struct pmc_event_descr core_event_table[] = +{ + __PMC_EV_ALIAS_CORE() +}; + + +static const struct pmc_event_descr core2_event_table[] = +{ + __PMC_EV_ALIAS_CORE2() +}; + +static const struct pmc_event_descr corei7_event_table[] = +{ + __PMC_EV_ALIAS_COREI7() +}; + +static const struct pmc_event_descr westmere_event_table[] = +{ + __PMC_EV_ALIAS_WESTMERE() +}; + +static const struct pmc_event_descr corei7uc_event_table[] = +{ + __PMC_EV_ALIAS_COREI7UC() +}; + +static const struct pmc_event_descr westmereuc_event_table[] = +{ + __PMC_EV_ALIAS_WESTMEREUC() +}; + +/* + * PMC_MDEP_TABLE(NAME, PRIMARYCLASS, ADDITIONAL_CLASSES...) + * + * Map a CPU to the PMC classes it supports. + */ +#define PMC_MDEP_TABLE(N,C,...) 
\ + static const enum pmc_class N##_pmc_classes[] = { \ + PMC_CLASS_##C, __VA_ARGS__ \ + } + +PMC_MDEP_TABLE(atom, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC); +PMC_MDEP_TABLE(core, IAP, PMC_CLASS_TSC); +PMC_MDEP_TABLE(core2, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC); +PMC_MDEP_TABLE(corei7, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC, PMC_CLASS_UCF, PMC_CLASS_UCP); +PMC_MDEP_TABLE(westmere, IAP, PMC_CLASS_IAF, PMC_CLASS_TSC, PMC_CLASS_UCF, PMC_CLASS_UCP); +PMC_MDEP_TABLE(k7, K7, PMC_CLASS_TSC); +PMC_MDEP_TABLE(k8, K8, PMC_CLASS_TSC); +PMC_MDEP_TABLE(p4, P4, PMC_CLASS_TSC); +PMC_MDEP_TABLE(p5, P5, PMC_CLASS_TSC); +PMC_MDEP_TABLE(p6, P6, PMC_CLASS_TSC); +PMC_MDEP_TABLE(xscale, XSCALE, PMC_CLASS_XSCALE); +PMC_MDEP_TABLE(mips24k, MIPS24K, PMC_CLASS_MIPS24K); + +static const struct pmc_event_descr tsc_event_table[] = +{ + __PMC_EV_TSC() +}; + +#undef PMC_CLASS_TABLE_DESC +#define PMC_CLASS_TABLE_DESC(NAME, CLASS, EVENTS, ALLOCATOR) \ +static const struct pmc_class_descr NAME##_class_table_descr = \ + { \ + .pm_evc_name = #CLASS "-", \ + .pm_evc_name_size = sizeof(#CLASS "-") - 1, \ + .pm_evc_class = PMC_CLASS_##CLASS , \ + .pm_evc_event_table = EVENTS##_event_table , \ + .pm_evc_event_table_size = \ + PMC_EVENT_TABLE_SIZE(EVENTS), \ + .pm_evc_allocate_pmc = ALLOCATOR##_allocate_pmc \ + } + +#if defined(__i386__) || defined(__amd64__) +PMC_CLASS_TABLE_DESC(iaf, IAF, iaf, iaf); +PMC_CLASS_TABLE_DESC(atom, IAP, atom, iap); +PMC_CLASS_TABLE_DESC(core, IAP, core, iap); +PMC_CLASS_TABLE_DESC(core2, IAP, core2, iap); +PMC_CLASS_TABLE_DESC(corei7, IAP, corei7, iap); +PMC_CLASS_TABLE_DESC(westmere, IAP, westmere, iap); +PMC_CLASS_TABLE_DESC(ucf, UCF, ucf, ucf); +PMC_CLASS_TABLE_DESC(corei7uc, UCP, corei7uc, ucp); +PMC_CLASS_TABLE_DESC(westmereuc, UCP, westmereuc, ucp); +#endif +#if defined(__i386__) +PMC_CLASS_TABLE_DESC(k7, K7, k7, k7); +#endif +#if defined(__i386__) || defined(__amd64__) +PMC_CLASS_TABLE_DESC(k8, K8, k8, k8); +PMC_CLASS_TABLE_DESC(p4, P4, p4, p4); +#endif +#if defined(__i386__) 
+PMC_CLASS_TABLE_DESC(p5, P5, p5, p5); +PMC_CLASS_TABLE_DESC(p6, P6, p6, p6); +#endif +#if defined(__i386__) || defined(__amd64__) +PMC_CLASS_TABLE_DESC(tsc, TSC, tsc, tsc); +#endif +#if defined(__XSCALE__) +PMC_CLASS_TABLE_DESC(xscale, XSCALE, xscale, xscale); +#endif + +#if defined(__mips__) +PMC_CLASS_TABLE_DESC(mips24k, MIPS24K, mips24k, mips24k); +#endif /* __mips__ */ + +#undef PMC_CLASS_TABLE_DESC + +static const struct pmc_class_descr **pmc_class_table; +#define PMC_CLASS_TABLE_SIZE cpu_info.pm_nclass + +static const enum pmc_class *pmc_mdep_class_list; +static size_t pmc_mdep_class_list_size; + +/* + * Mapping tables, mapping enumeration values to human readable + * strings. + */ + +static const char * pmc_capability_names[] = { +#undef __PMC_CAP +#define __PMC_CAP(N,V,D) #N , + __PMC_CAPS() +}; + +static const char * pmc_class_names[] = { +#undef __PMC_CLASS +#define __PMC_CLASS(C) #C , + __PMC_CLASSES() +}; + +struct pmc_cputype_map { + enum pmc_class pm_cputype; + const char *pm_name; +}; + +static const struct pmc_cputype_map pmc_cputype_names[] = { +#undef __PMC_CPU +#define __PMC_CPU(S, V, D) { .pm_cputype = PMC_CPU_##S, .pm_name = #S } , + __PMC_CPUS() +}; + +static const char * pmc_disposition_names[] = { +#undef __PMC_DISP +#define __PMC_DISP(D) #D , + __PMC_DISPOSITIONS() +}; + +static const char * pmc_mode_names[] = { +#undef __PMC_MODE +#define __PMC_MODE(M,N) #M , + __PMC_MODES() +}; + +static const char * pmc_state_names[] = { +#undef __PMC_STATE +#define __PMC_STATE(S) #S , + __PMC_STATES() +}; + +static int pmc_syscall = -1; /* filled in by pmc_init() */ + +static struct pmc_cpuinfo cpu_info; /* filled in by pmc_init() */ + +/* Event masks for events */ +struct pmc_masks { + const char *pm_name; + const uint32_t pm_value; +}; +#define PMCMASK(N,V) { .pm_name = #N, .pm_value = (V) } +#define NULLMASK { .pm_name = NULL } + +#if defined(__amd64__) || defined(__i386__) +static int +pmc_parse_mask(const struct pmc_masks *pmask, char *p, uint32_t 
*evmask) +{ + const struct pmc_masks *pm; + char *q, *r; + int c; + + if (pmask == NULL) /* no mask keywords */ + return (-1); + q = strchr(p, '='); /* skip '=' */ + if (*++q == '\0') /* no more data */ + return (-1); + c = 0; /* count of mask keywords seen */ + while ((r = strsep(&q, "+")) != NULL) { + for (pm = pmask; pm->pm_name && strcasecmp(r, pm->pm_name); + pm++) + ; + if (pm->pm_name == NULL) /* not found */ + return (-1); + *evmask |= pm->pm_value; + c++; + } + return (c); +} +#endif + +#define KWMATCH(p,kw) (strcasecmp((p), (kw)) == 0) +#define KWPREFIXMATCH(p,kw) (strncasecmp((p), (kw), sizeof((kw)) - 1) == 0) +#define EV_ALIAS(N,S) { .pm_alias = N, .pm_spec = S } + +#if defined(__i386__) + +/* + * AMD K7 (Athlon) CPUs. + */ + +static struct pmc_event_alias k7_aliases[] = { + EV_ALIAS("branches", "k7-retired-branches"), + EV_ALIAS("branch-mispredicts", "k7-retired-branches-mispredicted"), + EV_ALIAS("cycles", "tsc"), + EV_ALIAS("dc-misses", "k7-dc-misses"), + EV_ALIAS("ic-misses", "k7-ic-misses"), + EV_ALIAS("instructions", "k7-retired-instructions"), + EV_ALIAS("interrupts", "k7-hardware-interrupts"), + EV_ALIAS(NULL, NULL) +}; + +#define K7_KW_COUNT "count" +#define K7_KW_EDGE "edge" +#define K7_KW_INV "inv" +#define K7_KW_OS "os" +#define K7_KW_UNITMASK "unitmask" +#define K7_KW_USR "usr" + +static int +k7_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + char *e, *p, *q; + int c, has_unitmask; + uint32_t count, unitmask; + + pmc_config->pm_md.pm_amd.pm_amd_config = 0; + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); + + if (pe == PMC_EV_K7_DC_REFILLS_FROM_L2 || + pe == PMC_EV_K7_DC_REFILLS_FROM_SYSTEM || + pe == PMC_EV_K7_DC_WRITEBACKS) { + has_unitmask = 1; + unitmask = AMD_PMC_UNITMASK_MOESI; + } else + unitmask = has_unitmask = 0; + + while ((p = strsep(&ctrspec, ",")) != NULL) { + if (KWPREFIXMATCH(p, K7_KW_COUNT "=")) { + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + + 
count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + + pmc_config->pm_caps |= PMC_CAP_THRESHOLD; + pmc_config->pm_md.pm_amd.pm_amd_config |= + AMD_PMC_TO_COUNTER(count); + + } else if (KWMATCH(p, K7_KW_EDGE)) { + pmc_config->pm_caps |= PMC_CAP_EDGE; + } else if (KWMATCH(p, K7_KW_INV)) { + pmc_config->pm_caps |= PMC_CAP_INVERT; + } else if (KWMATCH(p, K7_KW_OS)) { + pmc_config->pm_caps |= PMC_CAP_SYSTEM; + } else if (KWPREFIXMATCH(p, K7_KW_UNITMASK "=")) { + if (has_unitmask == 0) + return (-1); + unitmask = 0; + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + + while ((c = tolower(*q++)) != 0) + if (c == 'm') + unitmask |= AMD_PMC_UNITMASK_M; + else if (c == 'o') + unitmask |= AMD_PMC_UNITMASK_O; + else if (c == 'e') + unitmask |= AMD_PMC_UNITMASK_E; + else if (c == 's') + unitmask |= AMD_PMC_UNITMASK_S; + else if (c == 'i') + unitmask |= AMD_PMC_UNITMASK_I; + else if (c == '+') + continue; + else + return (-1); + + if (unitmask == 0) + return (-1); + + } else if (KWMATCH(p, K7_KW_USR)) { + pmc_config->pm_caps |= PMC_CAP_USER; + } else + return (-1); + } + + if (has_unitmask) { + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + pmc_config->pm_md.pm_amd.pm_amd_config |= + AMD_PMC_TO_UNITMASK(unitmask); + } + + return (0); + +} + +#endif + +#if defined(__amd64__) || defined(__i386__) + +/* + * Intel Core (Family 6, Model E) PMCs. + */ + +static struct pmc_event_alias core_aliases[] = { + EV_ALIAS("branches", "iap-br-instr-ret"), + EV_ALIAS("branch-mispredicts", "iap-br-mispred-ret"), + EV_ALIAS("cycles", "tsc-tsc"), + EV_ALIAS("ic-misses", "iap-icache-misses"), + EV_ALIAS("instructions", "iap-instr-ret"), + EV_ALIAS("interrupts", "iap-core-hw-int-rx"), + EV_ALIAS("unhalted-cycles", "iap-unhalted-core-cycles"), + EV_ALIAS(NULL, NULL) +}; + +/* + * Intel Core2 (Family 6, Model F), Core2Extreme (Family 6, Model 17H) + * and Atom (Family 6, model 1CH) PMCs. 
+ * + * We map aliases to events on the fixed-function counters if these + * are present. Note that not all CPUs in this family contain fixed-function + * counters. + */ + +static struct pmc_event_alias core2_aliases[] = { + EV_ALIAS("branches", "iap-br-inst-retired.any"), + EV_ALIAS("branch-mispredicts", "iap-br-inst-retired.mispred"), + EV_ALIAS("cycles", "tsc-tsc"), + EV_ALIAS("ic-misses", "iap-l1i-misses"), + EV_ALIAS("instructions", "iaf-instr-retired.any"), + EV_ALIAS("interrupts", "iap-hw-int-rcv"), + EV_ALIAS("unhalted-cycles", "iaf-cpu-clk-unhalted.core"), + EV_ALIAS(NULL, NULL) +}; + +static struct pmc_event_alias core2_aliases_without_iaf[] = { + EV_ALIAS("branches", "iap-br-inst-retired.any"), + EV_ALIAS("branch-mispredicts", "iap-br-inst-retired.mispred"), + EV_ALIAS("cycles", "tsc-tsc"), + EV_ALIAS("ic-misses", "iap-l1i-misses"), + EV_ALIAS("instructions", "iap-inst-retired.any_p"), + EV_ALIAS("interrupts", "iap-hw-int-rcv"), + EV_ALIAS("unhalted-cycles", "iap-cpu-clk-unhalted.core_p"), + EV_ALIAS(NULL, NULL) +}; + +#define atom_aliases core2_aliases +#define atom_aliases_without_iaf core2_aliases_without_iaf +#define corei7_aliases core2_aliases +#define corei7_aliases_without_iaf core2_aliases_without_iaf +#define westmere_aliases core2_aliases +#define westmere_aliases_without_iaf core2_aliases_without_iaf + +#define IAF_KW_OS "os" +#define IAF_KW_USR "usr" +#define IAF_KW_ANYTHREAD "anythread" + +/* + * Parse an event specifier for Intel fixed function counters. 
+ */ +static int +iaf_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + char *p; + + (void) pe; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); + pmc_config->pm_md.pm_iaf.pm_iaf_flags = 0; + + while ((p = strsep(&ctrspec, ",")) != NULL) { + if (KWMATCH(p, IAF_KW_OS)) + pmc_config->pm_caps |= PMC_CAP_SYSTEM; + else if (KWMATCH(p, IAF_KW_USR)) + pmc_config->pm_caps |= PMC_CAP_USER; + else if (KWMATCH(p, IAF_KW_ANYTHREAD)) + pmc_config->pm_md.pm_iaf.pm_iaf_flags |= IAF_ANY; + else + return (-1); + } + + return (0); +} + +/* + * Core/Core2 support. + */ + +#define IAP_KW_AGENT "agent" +#define IAP_KW_ANYTHREAD "anythread" +#define IAP_KW_CACHESTATE "cachestate" +#define IAP_KW_CMASK "cmask" +#define IAP_KW_CORE "core" +#define IAP_KW_EDGE "edge" +#define IAP_KW_INV "inv" +#define IAP_KW_OS "os" +#define IAP_KW_PREFETCH "prefetch" +#define IAP_KW_SNOOPRESPONSE "snoopresponse" +#define IAP_KW_SNOOPTYPE "snooptype" +#define IAP_KW_TRANSITION "trans" +#define IAP_KW_USR "usr" +#define IAP_KW_RSP "rsp" + +static struct pmc_masks iap_core_mask[] = { + PMCMASK(all, (0x3 << 14)), + PMCMASK(this, (0x1 << 14)), + NULLMASK +}; + +static struct pmc_masks iap_agent_mask[] = { + PMCMASK(this, 0), + PMCMASK(any, (0x1 << 13)), + NULLMASK +}; + +static struct pmc_masks iap_prefetch_mask[] = { + PMCMASK(both, (0x3 << 12)), + PMCMASK(only, (0x1 << 12)), + PMCMASK(exclude, 0), + NULLMASK +}; + +static struct pmc_masks iap_cachestate_mask[] = { + PMCMASK(i, (1 << 8)), + PMCMASK(s, (1 << 9)), + PMCMASK(e, (1 << 10)), + PMCMASK(m, (1 << 11)), + NULLMASK +}; + +static struct pmc_masks iap_snoopresponse_mask[] = { + PMCMASK(clean, (1 << 8)), + PMCMASK(hit, (1 << 9)), + PMCMASK(hitm, (1 << 11)), + NULLMASK +}; + +static struct pmc_masks iap_snooptype_mask[] = { + PMCMASK(cmp2s, (1 << 8)), + PMCMASK(cmp2i, (1 << 9)), + NULLMASK +}; + +static struct pmc_masks iap_transition_mask[] = { + PMCMASK(any, 0x00), + PMCMASK(frequency, 0x10), + 
NULLMASK +}; + +static struct pmc_masks iap_rsp_mask[] = { + PMCMASK(DMND_DATA_RD, (1 << 0)), + PMCMASK(DMND_RFO, (1 << 1)), + PMCMASK(DMND_IFETCH, (1 << 2)), + PMCMASK(WB, (1 << 3)), + PMCMASK(PF_DATA_RD, (1 << 4)), + PMCMASK(PF_RFO, (1 << 5)), + PMCMASK(PF_IFETCH, (1 << 6)), + PMCMASK(OTHER, (1 << 7)), + PMCMASK(UNCORE_HIT, (1 << 8)), + PMCMASK(OTHER_CORE_HIT_SNP, (1 << 9)), + PMCMASK(OTHER_CORE_HITM, (1 << 10)), + PMCMASK(REMOTE_CACHE_FWD, (1 << 12)), + PMCMASK(REMOTE_DRAM, (1 << 13)), + PMCMASK(LOCAL_DRAM, (1 << 14)), + PMCMASK(NON_DRAM, (1 << 15)), + NULLMASK +}; + +static int +iap_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + char *e, *p, *q; + uint32_t cachestate, evmask, rsp; + int count, n; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE | + PMC_CAP_QUALIFIER); + pmc_config->pm_md.pm_iap.pm_iap_config = 0; + + cachestate = evmask = rsp = 0; + + /* Parse additional modifiers if present */ + while ((p = strsep(&ctrspec, ",")) != NULL) { + + n = 0; + if (KWPREFIXMATCH(p, IAP_KW_CMASK "=")) { + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + pmc_config->pm_caps |= PMC_CAP_THRESHOLD; + pmc_config->pm_md.pm_iap.pm_iap_config |= + IAP_CMASK(count); + } else if (KWMATCH(p, IAP_KW_EDGE)) { + pmc_config->pm_caps |= PMC_CAP_EDGE; + } else if (KWMATCH(p, IAP_KW_INV)) { + pmc_config->pm_caps |= PMC_CAP_INVERT; + } else if (KWMATCH(p, IAP_KW_OS)) { + pmc_config->pm_caps |= PMC_CAP_SYSTEM; + } else if (KWMATCH(p, IAP_KW_USR)) { + pmc_config->pm_caps |= PMC_CAP_USER; + } else if (KWMATCH(p, IAP_KW_ANYTHREAD)) { + pmc_config->pm_md.pm_iap.pm_iap_config |= IAP_ANY; + } else if (KWPREFIXMATCH(p, IAP_KW_CORE "=")) { + n = pmc_parse_mask(iap_core_mask, p, &evmask); + if (n != 1) + return (-1); + } else if (KWPREFIXMATCH(p, IAP_KW_AGENT "=")) { + n = pmc_parse_mask(iap_agent_mask, p, &evmask); + if (n != 1) + return (-1); + } 
else if (KWPREFIXMATCH(p, IAP_KW_PREFETCH "=")) { + n = pmc_parse_mask(iap_prefetch_mask, p, &evmask); + if (n != 1) + return (-1); + } else if (KWPREFIXMATCH(p, IAP_KW_CACHESTATE "=")) { + n = pmc_parse_mask(iap_cachestate_mask, p, &cachestate); + } else if (cpu_info.pm_cputype == PMC_CPU_INTEL_CORE && + KWPREFIXMATCH(p, IAP_KW_TRANSITION "=")) { + n = pmc_parse_mask(iap_transition_mask, p, &evmask); + if (n != 1) + return (-1); + } else if (cpu_info.pm_cputype == PMC_CPU_INTEL_ATOM || + cpu_info.pm_cputype == PMC_CPU_INTEL_CORE2 || + cpu_info.pm_cputype == PMC_CPU_INTEL_CORE2EXTREME) { + if (KWPREFIXMATCH(p, IAP_KW_SNOOPRESPONSE "=")) { + n = pmc_parse_mask(iap_snoopresponse_mask, p, + &evmask); + } else if (KWPREFIXMATCH(p, IAP_KW_SNOOPTYPE "=")) { + n = pmc_parse_mask(iap_snooptype_mask, p, + &evmask); + } else + return (-1); + } else if (cpu_info.pm_cputype == PMC_CPU_INTEL_COREI7 || + cpu_info.pm_cputype == PMC_CPU_INTEL_WESTMERE) { + if (KWPREFIXMATCH(p, IAP_KW_RSP "=")) { + n = pmc_parse_mask(iap_rsp_mask, p, &rsp); + } else + return (-1); + } else + return (-1); + + if (n < 0) /* Parsing failed. */ + return (-1); + } + + pmc_config->pm_md.pm_iap.pm_iap_config |= evmask; + + /* + * If the event requires a 'cachestate' qualifier but was not + * specified by the user, use a sensible default. 
+ */ + switch (pe) { + case PMC_EV_IAP_EVENT_28H: /* Core, Core2, Atom */ + case PMC_EV_IAP_EVENT_29H: /* Core, Core2, Atom */ + case PMC_EV_IAP_EVENT_2AH: /* Core, Core2, Atom */ + case PMC_EV_IAP_EVENT_2BH: /* Atom, Core2 */ + case PMC_EV_IAP_EVENT_2EH: /* Core, Core2, Atom */ + case PMC_EV_IAP_EVENT_30H: /* Core, Core2, Atom */ + case PMC_EV_IAP_EVENT_32H: /* Core */ + case PMC_EV_IAP_EVENT_40H: /* Core */ + case PMC_EV_IAP_EVENT_41H: /* Core */ + case PMC_EV_IAP_EVENT_42H: /* Core, Core2, Atom */ + if (cachestate == 0) + cachestate = (0xF << 8); + break; + case PMC_EV_IAP_EVENT_77H: /* Atom */ + /* IAP_EVENT_77H only accepts a cachestate qualifier on the + * Atom processor + */ + if(cpu_info.pm_cputype == PMC_CPU_INTEL_ATOM && cachestate == 0) + cachestate = (0xF << 8); + break; + default: + break; + } + + pmc_config->pm_md.pm_iap.pm_iap_config |= cachestate; + pmc_config->pm_md.pm_iap.pm_iap_rsp = rsp; + + return (0); +} + +/* + * Intel Uncore. + */ + +static int +ucf_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + (void) pe; + (void) ctrspec; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); + pmc_config->pm_md.pm_ucf.pm_ucf_flags = 0; + + return (0); +} + +#define UCP_KW_CMASK "cmask" +#define UCP_KW_EDGE "edge" +#define UCP_KW_INV "inv" + +static int +ucp_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + char *e, *p, *q; + int count, n; + + (void) pe; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE | + PMC_CAP_QUALIFIER); + pmc_config->pm_md.pm_ucp.pm_ucp_config = 0; + + /* Parse additional modifiers if present */ + while ((p = strsep(&ctrspec, ",")) != NULL) { + + n = 0; + if (KWPREFIXMATCH(p, UCP_KW_CMASK "=")) { + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + pmc_config->pm_caps |= PMC_CAP_THRESHOLD; + pmc_config->pm_md.pm_ucp.pm_ucp_config |= + 
UCP_CMASK(count); + } else if (KWMATCH(p, UCP_KW_EDGE)) { + pmc_config->pm_caps |= PMC_CAP_EDGE; + } else if (KWMATCH(p, UCP_KW_INV)) { + pmc_config->pm_caps |= PMC_CAP_INVERT; + } else + return (-1); + + if (n < 0) /* Parsing failed. */ + return (-1); + } + + return (0); +} + +/* + * AMD K8 PMCs. + * + * These are very similar to AMD K7 PMCs, but support more kinds of + * events. + */ + +static struct pmc_event_alias k8_aliases[] = { + EV_ALIAS("branches", "k8-fr-retired-taken-branches"), + EV_ALIAS("branch-mispredicts", + "k8-fr-retired-taken-branches-mispredicted"), + EV_ALIAS("cycles", "tsc"), + EV_ALIAS("dc-misses", "k8-dc-miss"), + EV_ALIAS("ic-misses", "k8-ic-miss"), + EV_ALIAS("instructions", "k8-fr-retired-x86-instructions"), + EV_ALIAS("interrupts", "k8-fr-taken-hardware-interrupts"), + EV_ALIAS("unhalted-cycles", "k8-bu-cpu-clk-unhalted"), + EV_ALIAS(NULL, NULL) +}; + +#define __K8MASK(N,V) PMCMASK(N,(1 << (V))) + +/* + * Parsing tables + */ + +/* fp dispatched fpu ops */ +static const struct pmc_masks k8_mask_fdfo[] = { + __K8MASK(add-pipe-excluding-junk-ops, 0), + __K8MASK(multiply-pipe-excluding-junk-ops, 1), + __K8MASK(store-pipe-excluding-junk-ops, 2), + __K8MASK(add-pipe-junk-ops, 3), + __K8MASK(multiply-pipe-junk-ops, 4), + __K8MASK(store-pipe-junk-ops, 5), + NULLMASK +}; + +/* ls segment register loads */ +static const struct pmc_masks k8_mask_lsrl[] = { + __K8MASK(es, 0), + __K8MASK(cs, 1), + __K8MASK(ss, 2), + __K8MASK(ds, 3), + __K8MASK(fs, 4), + __K8MASK(gs, 5), + __K8MASK(hs, 6), + NULLMASK +}; + +/* ls locked operation */ +static const struct pmc_masks k8_mask_llo[] = { + __K8MASK(locked-instructions, 0), + __K8MASK(cycles-in-request, 1), + __K8MASK(cycles-to-complete, 2), + NULLMASK +}; + +/* dc refill from {l2,system} and dc copyback */ +static const struct pmc_masks k8_mask_dc[] = { + __K8MASK(invalid, 0), + __K8MASK(shared, 1), + __K8MASK(exclusive, 2), + __K8MASK(owner, 3), + __K8MASK(modified, 4), + NULLMASK +}; + +/* dc one bit ecc 
error */ +static const struct pmc_masks k8_mask_dobee[] = { + __K8MASK(scrubber, 0), + __K8MASK(piggyback, 1), + NULLMASK +}; + +/* dc dispatched prefetch instructions */ +static const struct pmc_masks k8_mask_ddpi[] = { + __K8MASK(load, 0), + __K8MASK(store, 1), + __K8MASK(nta, 2), + NULLMASK +}; + +/* dc dcache accesses by locks */ +static const struct pmc_masks k8_mask_dabl[] = { + __K8MASK(accesses, 0), + __K8MASK(misses, 1), + NULLMASK +}; + +/* bu internal l2 request */ +static const struct pmc_masks k8_mask_bilr[] = { + __K8MASK(ic-fill, 0), + __K8MASK(dc-fill, 1), + __K8MASK(tlb-reload, 2), + __K8MASK(tag-snoop, 3), + __K8MASK(cancelled, 4), + NULLMASK +}; + +/* bu fill request l2 miss */ +static const struct pmc_masks k8_mask_bfrlm[] = { + __K8MASK(ic-fill, 0), + __K8MASK(dc-fill, 1), + __K8MASK(tlb-reload, 2), + NULLMASK +}; + +/* bu fill into l2 */ +static const struct pmc_masks k8_mask_bfil[] = { + __K8MASK(dirty-l2-victim, 0), + __K8MASK(victim-from-l2, 1), + NULLMASK +}; + +/* fr retired fpu instructions */ +static const struct pmc_masks k8_mask_frfi[] = { + __K8MASK(x87, 0), + __K8MASK(mmx-3dnow, 1), + __K8MASK(packed-sse-sse2, 2), + __K8MASK(scalar-sse-sse2, 3), + NULLMASK +}; + +/* fr retired fastpath double op instructions */ +static const struct pmc_masks k8_mask_frfdoi[] = { + __K8MASK(low-op-pos-0, 0), + __K8MASK(low-op-pos-1, 1), + __K8MASK(low-op-pos-2, 2), + NULLMASK +}; + +/* fr fpu exceptions */ +static const struct pmc_masks k8_mask_ffe[] = { + __K8MASK(x87-reclass-microfaults, 0), + __K8MASK(sse-retype-microfaults, 1), + __K8MASK(sse-reclass-microfaults, 2), + __K8MASK(sse-and-x87-microtraps, 3), + NULLMASK +}; + +/* nb memory controller page access event */ +static const struct pmc_masks k8_mask_nmcpae[] = { + __K8MASK(page-hit, 0), + __K8MASK(page-miss, 1), + __K8MASK(page-conflict, 2), + NULLMASK +}; + +/* nb memory controller turnaround */ +static const struct pmc_masks k8_mask_nmct[] = { + __K8MASK(dimm-turnaround, 0), + 
__K8MASK(read-to-write-turnaround, 1), + __K8MASK(write-to-read-turnaround, 2), + NULLMASK +}; + +/* nb memory controller bypass saturation */ +static const struct pmc_masks k8_mask_nmcbs[] = { + __K8MASK(memory-controller-hi-pri-bypass, 0), + __K8MASK(memory-controller-lo-pri-bypass, 1), + __K8MASK(dram-controller-interface-bypass, 2), + __K8MASK(dram-controller-queue-bypass, 3), + NULLMASK +}; + +/* nb sized commands */ +static const struct pmc_masks k8_mask_nsc[] = { + __K8MASK(nonpostwrszbyte, 0), + __K8MASK(nonpostwrszdword, 1), + __K8MASK(postwrszbyte, 2), + __K8MASK(postwrszdword, 3), + __K8MASK(rdszbyte, 4), + __K8MASK(rdszdword, 5), + __K8MASK(rdmodwr, 6), + NULLMASK +}; + +/* nb probe result */ +static const struct pmc_masks k8_mask_npr[] = { + __K8MASK(probe-miss, 0), + __K8MASK(probe-hit, 1), + __K8MASK(probe-hit-dirty-no-memory-cancel, 2), + __K8MASK(probe-hit-dirty-with-memory-cancel, 3), + NULLMASK +}; + +/* nb hypertransport bus bandwidth */ +static const struct pmc_masks k8_mask_nhbb[] = { /* HT bus bandwidth */ + __K8MASK(command, 0), + __K8MASK(data, 1), + __K8MASK(buffer-release, 2), + __K8MASK(nop, 3), + NULLMASK +}; + +#undef __K8MASK + +#define K8_KW_COUNT "count" +#define K8_KW_EDGE "edge" +#define K8_KW_INV "inv" +#define K8_KW_MASK "mask" +#define K8_KW_OS "os" +#define K8_KW_USR "usr" + +static int +k8_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + char *e, *p, *q; + int n; + uint32_t count, evmask; + const struct pmc_masks *pm, *pmask; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); + pmc_config->pm_md.pm_amd.pm_amd_config = 0; + + pmask = NULL; + evmask = 0; + +#define __K8SETMASK(M) pmask = k8_mask_##M + + /* setup parsing tables */ + switch (pe) { + case PMC_EV_K8_FP_DISPATCHED_FPU_OPS: + __K8SETMASK(fdfo); + break; + case PMC_EV_K8_LS_SEGMENT_REGISTER_LOAD: + __K8SETMASK(lsrl); + break; + case PMC_EV_K8_LS_LOCKED_OPERATION: + __K8SETMASK(llo); + break; + case 
PMC_EV_K8_DC_REFILL_FROM_L2: + case PMC_EV_K8_DC_REFILL_FROM_SYSTEM: + case PMC_EV_K8_DC_COPYBACK: + __K8SETMASK(dc); + break; + case PMC_EV_K8_DC_ONE_BIT_ECC_ERROR: + __K8SETMASK(dobee); + break; + case PMC_EV_K8_DC_DISPATCHED_PREFETCH_INSTRUCTIONS: + __K8SETMASK(ddpi); + break; + case PMC_EV_K8_DC_DCACHE_ACCESSES_BY_LOCKS: + __K8SETMASK(dabl); + break; + case PMC_EV_K8_BU_INTERNAL_L2_REQUEST: + __K8SETMASK(bilr); + break; + case PMC_EV_K8_BU_FILL_REQUEST_L2_MISS: + __K8SETMASK(bfrlm); + break; + case PMC_EV_K8_BU_FILL_INTO_L2: + __K8SETMASK(bfil); + break; + case PMC_EV_K8_FR_RETIRED_FPU_INSTRUCTIONS: + __K8SETMASK(frfi); + break; + case PMC_EV_K8_FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS: + __K8SETMASK(frfdoi); + break; + case PMC_EV_K8_FR_FPU_EXCEPTIONS: + __K8SETMASK(ffe); + break; + case PMC_EV_K8_NB_MEMORY_CONTROLLER_PAGE_ACCESS_EVENT: + __K8SETMASK(nmcpae); + break; + case PMC_EV_K8_NB_MEMORY_CONTROLLER_TURNAROUND: + __K8SETMASK(nmct); + break; + case PMC_EV_K8_NB_MEMORY_CONTROLLER_BYPASS_SATURATION: + __K8SETMASK(nmcbs); + break; + case PMC_EV_K8_NB_SIZED_COMMANDS: + __K8SETMASK(nsc); + break; + case PMC_EV_K8_NB_PROBE_RESULT: + __K8SETMASK(npr); + break; + case PMC_EV_K8_NB_HT_BUS0_BANDWIDTH: + case PMC_EV_K8_NB_HT_BUS1_BANDWIDTH: + case PMC_EV_K8_NB_HT_BUS2_BANDWIDTH: + __K8SETMASK(nhbb); + break; + + default: + break; /* no options defined */ + } + + while ((p = strsep(&ctrspec, ",")) != NULL) { + if (KWPREFIXMATCH(p, K8_KW_COUNT "=")) { + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + + count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + + pmc_config->pm_caps |= PMC_CAP_THRESHOLD; + pmc_config->pm_md.pm_amd.pm_amd_config |= + AMD_PMC_TO_COUNTER(count); + + } else if (KWMATCH(p, K8_KW_EDGE)) { + pmc_config->pm_caps |= PMC_CAP_EDGE; + } else if (KWMATCH(p, K8_KW_INV)) { + pmc_config->pm_caps |= PMC_CAP_INVERT; + } else if (KWPREFIXMATCH(p, K8_KW_MASK "=")) { + if ((n = pmc_parse_mask(pmask, p, &evmask)) < 0) + 
return (-1); + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + } else if (KWMATCH(p, K8_KW_OS)) { + pmc_config->pm_caps |= PMC_CAP_SYSTEM; + } else if (KWMATCH(p, K8_KW_USR)) { + pmc_config->pm_caps |= PMC_CAP_USER; + } else + return (-1); + } + + /* other post processing */ + switch (pe) { + case PMC_EV_K8_FP_DISPATCHED_FPU_OPS: + case PMC_EV_K8_FP_CYCLES_WITH_NO_FPU_OPS_RETIRED: + case PMC_EV_K8_FP_DISPATCHED_FPU_FAST_FLAG_OPS: + case PMC_EV_K8_FR_RETIRED_FASTPATH_DOUBLE_OP_INSTRUCTIONS: + case PMC_EV_K8_FR_RETIRED_FPU_INSTRUCTIONS: + case PMC_EV_K8_FR_FPU_EXCEPTIONS: + /* XXX only available in rev B and later */ + break; + case PMC_EV_K8_DC_DCACHE_ACCESSES_BY_LOCKS: + /* XXX only available in rev C and later */ + break; + case PMC_EV_K8_LS_LOCKED_OPERATION: + /* XXX CPU Rev A,B evmask is to be zero */ + if (evmask & (evmask - 1)) /* > 1 bit set */ + return (-1); + if (evmask == 0) { + evmask = 0x01; /* Rev C and later: #instrs */ + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + } + break; + default: + if (evmask == 0 && pmask != NULL) { + for (pm = pmask; pm->pm_name; pm++) + evmask |= pm->pm_value; + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + } + } + + if (pmc_config->pm_caps & PMC_CAP_QUALIFIER) + pmc_config->pm_md.pm_amd.pm_amd_config = + AMD_PMC_TO_UNITMASK(evmask); + + return (0); +} + +#endif + +#if defined(__amd64__) || defined(__i386__) + +/* + * Intel P4 PMCs + */ + +static struct pmc_event_alias p4_aliases[] = { + EV_ALIAS("branches", "p4-branch-retired,mask=mmtp+mmtm"), + EV_ALIAS("branch-mispredicts", "p4-mispred-branch-retired"), + EV_ALIAS("cycles", "tsc"), + EV_ALIAS("instructions", + "p4-instr-retired,mask=nbogusntag+nbogustag"), + EV_ALIAS("unhalted-cycles", "p4-global-power-events"), + EV_ALIAS(NULL, NULL) +}; + +#define P4_KW_ACTIVE "active" +#define P4_KW_ACTIVE_ANY "any" +#define P4_KW_ACTIVE_BOTH "both" +#define P4_KW_ACTIVE_NONE "none" +#define P4_KW_ACTIVE_SINGLE "single" +#define P4_KW_BUSREQTYPE "busreqtype" +#define P4_KW_CASCADE "cascade" 
+#define P4_KW_EDGE "edge" +#define P4_KW_INV "complement" +#define P4_KW_OS "os" +#define P4_KW_MASK "mask" +#define P4_KW_PRECISE "precise" +#define P4_KW_TAG "tag" +#define P4_KW_THRESHOLD "threshold" +#define P4_KW_USR "usr" + +#define __P4MASK(N,V) PMCMASK(N, (1 << (V))) + +static const struct pmc_masks p4_mask_tcdm[] = { /* tc deliver mode */ + __P4MASK(dd, 0), + __P4MASK(db, 1), + __P4MASK(di, 2), + __P4MASK(bd, 3), + __P4MASK(bb, 4), + __P4MASK(bi, 5), + __P4MASK(id, 6), + __P4MASK(ib, 7), + NULLMASK +}; + +static const struct pmc_masks p4_mask_bfr[] = { /* bpu fetch request */ + __P4MASK(tcmiss, 0), + NULLMASK, +}; + +static const struct pmc_masks p4_mask_ir[] = { /* itlb reference */ + __P4MASK(hit, 0), + __P4MASK(miss, 1), + __P4MASK(hit-uc, 2), + NULLMASK +}; + +static const struct pmc_masks p4_mask_memcan[] = { /* memory cancel */ + __P4MASK(st-rb-full, 2), + __P4MASK(64k-conf, 3), + NULLMASK +}; + +static const struct pmc_masks p4_mask_memcomp[] = { /* memory complete */ + __P4MASK(lsc, 0), + __P4MASK(ssc, 1), + NULLMASK +}; + +static const struct pmc_masks p4_mask_lpr[] = { /* load port replay */ + __P4MASK(split-ld, 1), + NULLMASK +}; + +static const struct pmc_masks p4_mask_spr[] = { /* store port replay */ + __P4MASK(split-st, 1), + NULLMASK +}; + +static const struct pmc_masks p4_mask_mlr[] = { /* mob load replay */ + __P4MASK(no-sta, 1), + __P4MASK(no-std, 3), + __P4MASK(partial-data, 4), + __P4MASK(unalgn-addr, 5), + NULLMASK +}; + +static const struct pmc_masks p4_mask_pwt[] = { /* page walk type */ + __P4MASK(dtmiss, 0), + __P4MASK(itmiss, 1), + NULLMASK +}; + +static const struct pmc_masks p4_mask_bcr[] = { /* bsq cache reference */ + __P4MASK(rd-2ndl-hits, 0), + __P4MASK(rd-2ndl-hite, 1), + __P4MASK(rd-2ndl-hitm, 2), + __P4MASK(rd-3rdl-hits, 3), + __P4MASK(rd-3rdl-hite, 4), + __P4MASK(rd-3rdl-hitm, 5), + __P4MASK(rd-2ndl-miss, 8), + __P4MASK(rd-3rdl-miss, 9), + __P4MASK(wr-2ndl-miss, 10), + NULLMASK +}; + +static const struct pmc_masks 
p4_mask_ia[] = { /* ioq allocation */ + __P4MASK(all-read, 5), + __P4MASK(all-write, 6), + __P4MASK(mem-uc, 7), + __P4MASK(mem-wc, 8), + __P4MASK(mem-wt, 9), + __P4MASK(mem-wp, 10), + __P4MASK(mem-wb, 11), + __P4MASK(own, 13), + __P4MASK(other, 14), + __P4MASK(prefetch, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_iae[] = { /* ioq active entries */ + __P4MASK(all-read, 5), + __P4MASK(all-write, 6), + __P4MASK(mem-uc, 7), + __P4MASK(mem-wc, 8), + __P4MASK(mem-wt, 9), + __P4MASK(mem-wp, 10), + __P4MASK(mem-wb, 11), + __P4MASK(own, 13), + __P4MASK(other, 14), + __P4MASK(prefetch, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_fda[] = { /* fsb data activity */ + __P4MASK(drdy-drv, 0), + __P4MASK(drdy-own, 1), + __P4MASK(drdy-other, 2), + __P4MASK(dbsy-drv, 3), + __P4MASK(dbsy-own, 4), + __P4MASK(dbsy-other, 5), + NULLMASK +}; + +static const struct pmc_masks p4_mask_ba[] = { /* bsq allocation */ + __P4MASK(req-type0, 0), + __P4MASK(req-type1, 1), + __P4MASK(req-len0, 2), + __P4MASK(req-len1, 3), + __P4MASK(req-io-type, 5), + __P4MASK(req-lock-type, 6), + __P4MASK(req-cache-type, 7), + __P4MASK(req-split-type, 8), + __P4MASK(req-dem-type, 9), + __P4MASK(req-ord-type, 10), + __P4MASK(mem-type0, 11), + __P4MASK(mem-type1, 12), + __P4MASK(mem-type2, 13), + NULLMASK +}; + +static const struct pmc_masks p4_mask_sia[] = { /* sse input assist */ + __P4MASK(all, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_psu[] = { /* packed sp uop */ + __P4MASK(all, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_pdu[] = { /* packed dp uop */ + __P4MASK(all, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_ssu[] = { /* scalar sp uop */ + __P4MASK(all, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_sdu[] = { /* scalar dp uop */ + __P4MASK(all, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_64bmu[] = { /* 64 bit mmx uop */ + __P4MASK(all, 15), + NULLMASK +}; + +static const struct pmc_masks 
p4_mask_128bmu[] = { /* 128 bit mmx uop */ + __P4MASK(all, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_xfu[] = { /* X87 fp uop */ + __P4MASK(all, 15), + NULLMASK +}; + +static const struct pmc_masks p4_mask_xsmu[] = { /* x87 simd moves uop */ + __P4MASK(allp0, 3), + __P4MASK(allp2, 4), + NULLMASK +}; + +static const struct pmc_masks p4_mask_gpe[] = { /* global power events */ + __P4MASK(running, 0), + NULLMASK +}; + +static const struct pmc_masks p4_mask_tmx[] = { /* TC ms xfer */ + __P4MASK(cisc, 0), + NULLMASK +}; + +static const struct pmc_masks p4_mask_uqw[] = { /* uop queue writes */ + __P4MASK(from-tc-build, 0), + __P4MASK(from-tc-deliver, 1), + __P4MASK(from-rom, 2), + NULLMASK +}; + +static const struct pmc_masks p4_mask_rmbt[] = { + /* retired mispred branch type */ + __P4MASK(conditional, 1), + __P4MASK(call, 2), + __P4MASK(return, 3), + __P4MASK(indirect, 4), + NULLMASK +}; + +static const struct pmc_masks p4_mask_rbt[] = { /* retired branch type */ + __P4MASK(conditional, 1), + __P4MASK(call, 2), + __P4MASK(retired, 3), + __P4MASK(indirect, 4), + NULLMASK +}; + +static const struct pmc_masks p4_mask_rs[] = { /* resource stall */ + __P4MASK(sbfull, 5), + NULLMASK +}; + +static const struct pmc_masks p4_mask_wb[] = { /* WC buffer */ + __P4MASK(wcb-evicts, 0), + __P4MASK(wcb-full-evict, 1), + NULLMASK +}; + +static const struct pmc_masks p4_mask_fee[] = { /* front end event */ + __P4MASK(nbogus, 0), + __P4MASK(bogus, 1), + NULLMASK +}; + +static const struct pmc_masks p4_mask_ee[] = { /* execution event */ + __P4MASK(nbogus0, 0), + __P4MASK(nbogus1, 1), + __P4MASK(nbogus2, 2), + __P4MASK(nbogus3, 3), + __P4MASK(bogus0, 4), + __P4MASK(bogus1, 5), + __P4MASK(bogus2, 6), + __P4MASK(bogus3, 7), + NULLMASK +}; + +static const struct pmc_masks p4_mask_re[] = { /* replay event */ + __P4MASK(nbogus, 0), + __P4MASK(bogus, 1), + NULLMASK +}; + +static const struct pmc_masks p4_mask_insret[] = { /* instr retired */ + __P4MASK(nbogusntag, 0), + 
__P4MASK(nbogustag, 1), + __P4MASK(bogusntag, 2), + __P4MASK(bogustag, 3), + NULLMASK +}; + +static const struct pmc_masks p4_mask_ur[] = { /* uops retired */ + __P4MASK(nbogus, 0), + __P4MASK(bogus, 1), + NULLMASK +}; + +static const struct pmc_masks p4_mask_ut[] = { /* uop type */ + __P4MASK(tagloads, 1), + __P4MASK(tagstores, 2), + NULLMASK +}; + +static const struct pmc_masks p4_mask_br[] = { /* branch retired */ + __P4MASK(mmnp, 0), + __P4MASK(mmnm, 1), + __P4MASK(mmtp, 2), + __P4MASK(mmtm, 3), + NULLMASK +}; + +static const struct pmc_masks p4_mask_mbr[] = { /* mispred branch retired */ + __P4MASK(nbogus, 0), + NULLMASK +}; + +static const struct pmc_masks p4_mask_xa[] = { /* x87 assist */ + __P4MASK(fpsu, 0), + __P4MASK(fpso, 1), + __P4MASK(poao, 2), + __P4MASK(poau, 3), + __P4MASK(prea, 4), + NULLMASK +}; + +static const struct pmc_masks p4_mask_machclr[] = { /* machine clear */ + __P4MASK(clear, 0), + __P4MASK(moclear, 2), + __P4MASK(smclear, 3), + NULLMASK +}; + +/* P4 event parser */ +static int +p4_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + + char *e, *p, *q; + int count, has_tag, has_busreqtype, n; + uint32_t evmask, cccractivemask; + const struct pmc_masks *pm, *pmask; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); + pmc_config->pm_md.pm_p4.pm_p4_cccrconfig = + pmc_config->pm_md.pm_p4.pm_p4_escrconfig = 0; + + pmask = NULL; + evmask = 0; + cccractivemask = 0x3; + has_tag = has_busreqtype = 0; + +#define __P4SETMASK(M) do { \ + pmask = p4_mask_##M; \ +} while (0) + + switch (pe) { + case PMC_EV_P4_TC_DELIVER_MODE: + __P4SETMASK(tcdm); + break; + case PMC_EV_P4_BPU_FETCH_REQUEST: + __P4SETMASK(bfr); + break; + case PMC_EV_P4_ITLB_REFERENCE: + __P4SETMASK(ir); + break; + case PMC_EV_P4_MEMORY_CANCEL: + __P4SETMASK(memcan); + break; + case PMC_EV_P4_MEMORY_COMPLETE: + __P4SETMASK(memcomp); + break; + case PMC_EV_P4_LOAD_PORT_REPLAY: + __P4SETMASK(lpr); + break; + case 
PMC_EV_P4_STORE_PORT_REPLAY: + __P4SETMASK(spr); + break; + case PMC_EV_P4_MOB_LOAD_REPLAY: + __P4SETMASK(mlr); + break; + case PMC_EV_P4_PAGE_WALK_TYPE: + __P4SETMASK(pwt); + break; + case PMC_EV_P4_BSQ_CACHE_REFERENCE: + __P4SETMASK(bcr); + break; + case PMC_EV_P4_IOQ_ALLOCATION: + __P4SETMASK(ia); + has_busreqtype = 1; + break; + case PMC_EV_P4_IOQ_ACTIVE_ENTRIES: + __P4SETMASK(iae); + has_busreqtype = 1; + break; + case PMC_EV_P4_FSB_DATA_ACTIVITY: + __P4SETMASK(fda); + break; + case PMC_EV_P4_BSQ_ALLOCATION: + __P4SETMASK(ba); + break; + case PMC_EV_P4_SSE_INPUT_ASSIST: + __P4SETMASK(sia); + break; + case PMC_EV_P4_PACKED_SP_UOP: + __P4SETMASK(psu); + break; + case PMC_EV_P4_PACKED_DP_UOP: + __P4SETMASK(pdu); + break; + case PMC_EV_P4_SCALAR_SP_UOP: + __P4SETMASK(ssu); + break; + case PMC_EV_P4_SCALAR_DP_UOP: + __P4SETMASK(sdu); + break; + case PMC_EV_P4_64BIT_MMX_UOP: + __P4SETMASK(64bmu); + break; + case PMC_EV_P4_128BIT_MMX_UOP: + __P4SETMASK(128bmu); + break; + case PMC_EV_P4_X87_FP_UOP: + __P4SETMASK(xfu); + break; + case PMC_EV_P4_X87_SIMD_MOVES_UOP: + __P4SETMASK(xsmu); + break; + case PMC_EV_P4_GLOBAL_POWER_EVENTS: + __P4SETMASK(gpe); + break; + case PMC_EV_P4_TC_MS_XFER: + __P4SETMASK(tmx); + break; + case PMC_EV_P4_UOP_QUEUE_WRITES: + __P4SETMASK(uqw); + break; + case PMC_EV_P4_RETIRED_MISPRED_BRANCH_TYPE: + __P4SETMASK(rmbt); + break; + case PMC_EV_P4_RETIRED_BRANCH_TYPE: + __P4SETMASK(rbt); + break; + case PMC_EV_P4_RESOURCE_STALL: + __P4SETMASK(rs); + break; + case PMC_EV_P4_WC_BUFFER: + __P4SETMASK(wb); + break; + case PMC_EV_P4_BSQ_ACTIVE_ENTRIES: + case PMC_EV_P4_B2B_CYCLES: + case PMC_EV_P4_BNR: + case PMC_EV_P4_SNOOP: + case PMC_EV_P4_RESPONSE: + break; + case PMC_EV_P4_FRONT_END_EVENT: + __P4SETMASK(fee); + break; + case PMC_EV_P4_EXECUTION_EVENT: + __P4SETMASK(ee); + break; + case PMC_EV_P4_REPLAY_EVENT: + __P4SETMASK(re); + break; + case PMC_EV_P4_INSTR_RETIRED: + __P4SETMASK(insret); + break; + case PMC_EV_P4_UOPS_RETIRED: + 
__P4SETMASK(ur); + break; + case PMC_EV_P4_UOP_TYPE: + __P4SETMASK(ut); + break; + case PMC_EV_P4_BRANCH_RETIRED: + __P4SETMASK(br); + break; + case PMC_EV_P4_MISPRED_BRANCH_RETIRED: + __P4SETMASK(mbr); + break; + case PMC_EV_P4_X87_ASSIST: + __P4SETMASK(xa); + break; + case PMC_EV_P4_MACHINE_CLEAR: + __P4SETMASK(machclr); + break; + default: + return (-1); + } + + /* process additional flags */ + while ((p = strsep(&ctrspec, ",")) != NULL) { + if (KWPREFIXMATCH(p, P4_KW_ACTIVE)) { + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + + if (strcasecmp(q, P4_KW_ACTIVE_NONE) == 0) + cccractivemask = 0x0; + else if (strcasecmp(q, P4_KW_ACTIVE_SINGLE) == 0) + cccractivemask = 0x1; + else if (strcasecmp(q, P4_KW_ACTIVE_BOTH) == 0) + cccractivemask = 0x2; + else if (strcasecmp(q, P4_KW_ACTIVE_ANY) == 0) + cccractivemask = 0x3; + else + return (-1); + + } else if (KWPREFIXMATCH(p, P4_KW_BUSREQTYPE)) { + if (has_busreqtype == 0) + return (-1); + + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + + count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + evmask = (evmask & ~0x1F) | (count & 0x1F); + } else if (KWMATCH(p, P4_KW_CASCADE)) + pmc_config->pm_caps |= PMC_CAP_CASCADE; + else if (KWMATCH(p, P4_KW_EDGE)) + pmc_config->pm_caps |= PMC_CAP_EDGE; + else if (KWMATCH(p, P4_KW_INV)) + pmc_config->pm_caps |= PMC_CAP_INVERT; + else if (KWPREFIXMATCH(p, P4_KW_MASK "=")) { + if ((n = pmc_parse_mask(pmask, p, &evmask)) < 0) + return (-1); + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + } else if (KWMATCH(p, P4_KW_OS)) + pmc_config->pm_caps |= PMC_CAP_SYSTEM; + else if (KWMATCH(p, P4_KW_PRECISE)) + pmc_config->pm_caps |= PMC_CAP_PRECISE; + else if (KWPREFIXMATCH(p, P4_KW_TAG "=")) { + if (has_tag == 0) + return (-1); + + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + + count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + + pmc_config->pm_caps |= PMC_CAP_TAGGING; + 
pmc_config->pm_md.pm_p4.pm_p4_escrconfig |= + P4_ESCR_TO_TAG_VALUE(count); + } else if (KWPREFIXMATCH(p, P4_KW_THRESHOLD "=")) { + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + + count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + + pmc_config->pm_caps |= PMC_CAP_THRESHOLD; + pmc_config->pm_md.pm_p4.pm_p4_cccrconfig &= + ~P4_CCCR_THRESHOLD_MASK; + pmc_config->pm_md.pm_p4.pm_p4_cccrconfig |= + P4_CCCR_TO_THRESHOLD(count); + } else if (KWMATCH(p, P4_KW_USR)) + pmc_config->pm_caps |= PMC_CAP_USER; + else + return (-1); + } + + /* other post processing */ + if (pe == PMC_EV_P4_IOQ_ALLOCATION || + pe == PMC_EV_P4_FSB_DATA_ACTIVITY || + pe == PMC_EV_P4_BSQ_ALLOCATION) + pmc_config->pm_caps |= PMC_CAP_EDGE; + + /* fill in thread activity mask */ + pmc_config->pm_md.pm_p4.pm_p4_cccrconfig |= + P4_CCCR_TO_ACTIVE_THREAD(cccractivemask); + + if (evmask) + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + + switch (pe) { + case PMC_EV_P4_FSB_DATA_ACTIVITY: + if ((evmask & 0x06) == 0x06 || + (evmask & 0x18) == 0x18) + return (-1); /* can't have own+other bits together */ + if (evmask == 0) /* default:drdy-{drv,own}+dbsy{drv,own} */ + evmask = 0x1D; + break; + case PMC_EV_P4_MACHINE_CLEAR: + /* only one bit is allowed to be set */ + if ((evmask & (evmask - 1)) != 0) + return (-1); + if (evmask == 0) { + evmask = 0x1; /* 'CLEAR' */ + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + } + break; + default: + if (evmask == 0 && pmask) { + for (pm = pmask; pm->pm_name; pm++) + evmask |= pm->pm_value; + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + } + } + + pmc_config->pm_md.pm_p4.pm_p4_escrconfig = + P4_ESCR_TO_EVENT_MASK(evmask); + + return (0); +} + +#endif + +#if defined(__i386__) + +/* + * Pentium style PMCs + */ + +static struct pmc_event_alias p5_aliases[] = { + EV_ALIAS("branches", "p5-taken-branches"), + EV_ALIAS("cycles", "tsc"), + EV_ALIAS("dc-misses", "p5-data-read-miss-or-write-miss"), + EV_ALIAS("ic-misses", "p5-code-cache-miss"), + 
EV_ALIAS("instructions", "p5-instructions-executed"), + EV_ALIAS("interrupts", "p5-hardware-interrupts"), + EV_ALIAS("unhalted-cycles", + "p5-number-of-cycles-not-in-halt-state"), + EV_ALIAS(NULL, NULL) +}; + +static int +p5_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + return (-1 || pe || ctrspec || pmc_config); /* shut up gcc */ +} + +/* + * Pentium Pro style PMCs. These PMCs are found in Pentium II, Pentium III, + * and Pentium M CPUs. + */ + +static struct pmc_event_alias p6_aliases[] = { + EV_ALIAS("branches", "p6-br-inst-retired"), + EV_ALIAS("branch-mispredicts", "p6-br-miss-pred-retired"), + EV_ALIAS("cycles", "tsc"), + EV_ALIAS("dc-misses", "p6-dcu-lines-in"), + EV_ALIAS("ic-misses", "p6-ifu-fetch-miss"), + EV_ALIAS("instructions", "p6-inst-retired"), + EV_ALIAS("interrupts", "p6-hw-int-rx"), + EV_ALIAS("unhalted-cycles", "p6-cpu-clk-unhalted"), + EV_ALIAS(NULL, NULL) +}; + +#define P6_KW_CMASK "cmask" +#define P6_KW_EDGE "edge" +#define P6_KW_INV "inv" +#define P6_KW_OS "os" +#define P6_KW_UMASK "umask" +#define P6_KW_USR "usr" + +static struct pmc_masks p6_mask_mesi[] = { + PMCMASK(m, 0x01), + PMCMASK(e, 0x02), + PMCMASK(s, 0x04), + PMCMASK(i, 0x08), + NULLMASK +}; + +static struct pmc_masks p6_mask_mesihw[] = { + PMCMASK(m, 0x01), + PMCMASK(e, 0x02), + PMCMASK(s, 0x04), + PMCMASK(i, 0x08), + PMCMASK(nonhw, 0x00), + PMCMASK(hw, 0x10), + PMCMASK(both, 0x30), + NULLMASK +}; + +static struct pmc_masks p6_mask_hw[] = { + PMCMASK(nonhw, 0x00), + PMCMASK(hw, 0x10), + PMCMASK(both, 0x30), + NULLMASK +}; + +static struct pmc_masks p6_mask_any[] = { + PMCMASK(self, 0x00), + PMCMASK(any, 0x20), + NULLMASK +}; + +static struct pmc_masks p6_mask_ekp[] = { + PMCMASK(nta, 0x00), + PMCMASK(t1, 0x01), + PMCMASK(t2, 0x02), + PMCMASK(wos, 0x03), + NULLMASK +}; + +static struct pmc_masks p6_mask_pps[] = { + PMCMASK(packed-and-scalar, 0x00), + PMCMASK(scalar, 0x01), + NULLMASK +}; + +static struct pmc_masks p6_mask_mite[] = { + 
PMCMASK(packed-multiply, 0x01), + PMCMASK(packed-shift, 0x02), + PMCMASK(pack, 0x04), + PMCMASK(unpack, 0x08), + PMCMASK(packed-logical, 0x10), + PMCMASK(packed-arithmetic, 0x20), + NULLMASK +}; + +static struct pmc_masks p6_mask_fmt[] = { + PMCMASK(mmxtofp, 0x00), + PMCMASK(fptommx, 0x01), + NULLMASK +}; + +static struct pmc_masks p6_mask_sr[] = { + PMCMASK(es, 0x01), + PMCMASK(ds, 0x02), + PMCMASK(fs, 0x04), + PMCMASK(gs, 0x08), + NULLMASK +}; + +static struct pmc_masks p6_mask_eet[] = { + PMCMASK(all, 0x00), + PMCMASK(freq, 0x02), + NULLMASK +}; + +static struct pmc_masks p6_mask_efur[] = { + PMCMASK(all, 0x00), + PMCMASK(loadop, 0x01), + PMCMASK(stdsta, 0x02), + NULLMASK +}; + +static struct pmc_masks p6_mask_essir[] = { + PMCMASK(sse-packed-single, 0x00), + PMCMASK(sse-packed-single-scalar-single, 0x01), + PMCMASK(sse2-packed-double, 0x02), + PMCMASK(sse2-scalar-double, 0x03), + NULLMASK +}; + +static struct pmc_masks p6_mask_esscir[] = { + PMCMASK(sse-packed-single, 0x00), + PMCMASK(sse-scalar-single, 0x01), + PMCMASK(sse2-packed-double, 0x02), + PMCMASK(sse2-scalar-double, 0x03), + NULLMASK +}; + +/* P6 event parser */ +static int +p6_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + char *e, *p, *q; + uint32_t evmask; + int count, n; + const struct pmc_masks *pm, *pmask; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); + pmc_config->pm_md.pm_ppro.pm_ppro_config = 0; + + evmask = 0; + +#define P6MASKSET(M) pmask = p6_mask_ ## M + + switch(pe) { + case PMC_EV_P6_L2_IFETCH: P6MASKSET(mesi); break; + case PMC_EV_P6_L2_LD: P6MASKSET(mesi); break; + case PMC_EV_P6_L2_ST: P6MASKSET(mesi); break; + case PMC_EV_P6_L2_RQSTS: P6MASKSET(mesi); break; + case PMC_EV_P6_BUS_DRDY_CLOCKS: + case PMC_EV_P6_BUS_LOCK_CLOCKS: + case PMC_EV_P6_BUS_TRAN_BRD: + case PMC_EV_P6_BUS_TRAN_RFO: + case PMC_EV_P6_BUS_TRANS_WB: + case PMC_EV_P6_BUS_TRAN_IFETCH: + case PMC_EV_P6_BUS_TRAN_INVAL: + case PMC_EV_P6_BUS_TRAN_PWR: + case 
PMC_EV_P6_BUS_TRANS_P: + case PMC_EV_P6_BUS_TRANS_IO: + case PMC_EV_P6_BUS_TRAN_DEF: + case PMC_EV_P6_BUS_TRAN_BURST: + case PMC_EV_P6_BUS_TRAN_ANY: + case PMC_EV_P6_BUS_TRAN_MEM: + P6MASKSET(any); break; + case PMC_EV_P6_EMON_KNI_PREF_DISPATCHED: + case PMC_EV_P6_EMON_KNI_PREF_MISS: + P6MASKSET(ekp); break; + case PMC_EV_P6_EMON_KNI_INST_RETIRED: + case PMC_EV_P6_EMON_KNI_COMP_INST_RET: + P6MASKSET(pps); break; + case PMC_EV_P6_MMX_INSTR_TYPE_EXEC: + P6MASKSET(mite); break; + case PMC_EV_P6_FP_MMX_TRANS: + P6MASKSET(fmt); break; + case PMC_EV_P6_SEG_RENAME_STALLS: + case PMC_EV_P6_SEG_REG_RENAMES: + P6MASKSET(sr); break; + case PMC_EV_P6_EMON_EST_TRANS: + P6MASKSET(eet); break; + case PMC_EV_P6_EMON_FUSED_UOPS_RET: + P6MASKSET(efur); break; + case PMC_EV_P6_EMON_SSE_SSE2_INST_RETIRED: + P6MASKSET(essir); break; + case PMC_EV_P6_EMON_SSE_SSE2_COMP_INST_RETIRED: + P6MASKSET(esscir); break; + default: + pmask = NULL; + break; + } + + /* Pentium M PMCs have a few events with different semantics */ + if (cpu_info.pm_cputype == PMC_CPU_INTEL_PM) { + if (pe == PMC_EV_P6_L2_LD || + pe == PMC_EV_P6_L2_LINES_IN || + pe == PMC_EV_P6_L2_LINES_OUT) + P6MASKSET(mesihw); + else if (pe == PMC_EV_P6_L2_M_LINES_OUTM) + P6MASKSET(hw); + } + + /* Parse additional modifiers if present */ + while ((p = strsep(&ctrspec, ",")) != NULL) { + if (KWPREFIXMATCH(p, P6_KW_CMASK "=")) { + q = strchr(p, '='); + if (*++q == '\0') /* skip '=' */ + return (-1); + count = strtol(q, &e, 0); + if (e == q || *e != '\0') + return (-1); + pmc_config->pm_caps |= PMC_CAP_THRESHOLD; + pmc_config->pm_md.pm_ppro.pm_ppro_config |= + P6_EVSEL_TO_CMASK(count); + } else if (KWMATCH(p, P6_KW_EDGE)) { + pmc_config->pm_caps |= PMC_CAP_EDGE; + } else if (KWMATCH(p, P6_KW_INV)) { + pmc_config->pm_caps |= PMC_CAP_INVERT; + } else if (KWMATCH(p, P6_KW_OS)) { + pmc_config->pm_caps |= PMC_CAP_SYSTEM; + } else if (KWPREFIXMATCH(p, P6_KW_UMASK "=")) { + evmask = 0; + if ((n = pmc_parse_mask(pmask, p, &evmask)) < 0) + return 
(-1); + if ((pe == PMC_EV_P6_BUS_DRDY_CLOCKS || + pe == PMC_EV_P6_BUS_LOCK_CLOCKS || + pe == PMC_EV_P6_BUS_TRAN_BRD || + pe == PMC_EV_P6_BUS_TRAN_RFO || + pe == PMC_EV_P6_BUS_TRAN_IFETCH || + pe == PMC_EV_P6_BUS_TRAN_INVAL || + pe == PMC_EV_P6_BUS_TRAN_PWR || + pe == PMC_EV_P6_BUS_TRAN_DEF || + pe == PMC_EV_P6_BUS_TRAN_BURST || + pe == PMC_EV_P6_BUS_TRAN_ANY || + pe == PMC_EV_P6_BUS_TRAN_MEM || + pe == PMC_EV_P6_BUS_TRANS_IO || + pe == PMC_EV_P6_BUS_TRANS_P || + pe == PMC_EV_P6_BUS_TRANS_WB || + pe == PMC_EV_P6_EMON_EST_TRANS || + pe == PMC_EV_P6_EMON_FUSED_UOPS_RET || + pe == PMC_EV_P6_EMON_KNI_COMP_INST_RET || + pe == PMC_EV_P6_EMON_KNI_INST_RETIRED || + pe == PMC_EV_P6_EMON_KNI_PREF_DISPATCHED || + pe == PMC_EV_P6_EMON_KNI_PREF_MISS || + pe == PMC_EV_P6_EMON_SSE_SSE2_COMP_INST_RETIRED || + pe == PMC_EV_P6_EMON_SSE_SSE2_INST_RETIRED || + pe == PMC_EV_P6_FP_MMX_TRANS) + && (n > 1)) /* Only one mask keyword is allowed. */ + return (-1); + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + } else if (KWMATCH(p, P6_KW_USR)) { + pmc_config->pm_caps |= PMC_CAP_USER; + } else + return (-1); + } + + /* post processing */ + switch (pe) { + + /* + * The following events default to an evmask of 0 + */ + + /* default => 'self' */ + case PMC_EV_P6_BUS_DRDY_CLOCKS: + case PMC_EV_P6_BUS_LOCK_CLOCKS: + case PMC_EV_P6_BUS_TRAN_BRD: + case PMC_EV_P6_BUS_TRAN_RFO: + case PMC_EV_P6_BUS_TRANS_WB: + case PMC_EV_P6_BUS_TRAN_IFETCH: + case PMC_EV_P6_BUS_TRAN_INVAL: + case PMC_EV_P6_BUS_TRAN_PWR: + case PMC_EV_P6_BUS_TRANS_P: + case PMC_EV_P6_BUS_TRANS_IO: + case PMC_EV_P6_BUS_TRAN_DEF: + case PMC_EV_P6_BUS_TRAN_BURST: + case PMC_EV_P6_BUS_TRAN_ANY: + case PMC_EV_P6_BUS_TRAN_MEM: + + /* default => 'nta' */ + case PMC_EV_P6_EMON_KNI_PREF_DISPATCHED: + case PMC_EV_P6_EMON_KNI_PREF_MISS: + + /* default => 'packed and scalar' */ + case PMC_EV_P6_EMON_KNI_INST_RETIRED: + case PMC_EV_P6_EMON_KNI_COMP_INST_RET: + + /* default => 'mmx to fp transitions' */ + case PMC_EV_P6_FP_MMX_TRANS: + + /* default 
=> 'SSE Packed Single' */ + case PMC_EV_P6_EMON_SSE_SSE2_INST_RETIRED: + case PMC_EV_P6_EMON_SSE_SSE2_COMP_INST_RETIRED: + + /* default => 'all fused micro-ops' */ + case PMC_EV_P6_EMON_FUSED_UOPS_RET: + + /* default => 'all transitions' */ + case PMC_EV_P6_EMON_EST_TRANS: + break; + + case PMC_EV_P6_MMX_UOPS_EXEC: + evmask = 0x0F; /* only value allowed */ + break; + + default: + /* + * For all other events, set the default event mask + * to a logical OR of all the allowed event mask bits. + */ + if (evmask == 0 && pmask) { + for (pm = pmask; pm->pm_name; pm++) + evmask |= pm->pm_value; + pmc_config->pm_caps |= PMC_CAP_QUALIFIER; + } + + break; + } + + if (pmc_config->pm_caps & PMC_CAP_QUALIFIER) + pmc_config->pm_md.pm_ppro.pm_ppro_config |= + P6_EVSEL_TO_UMASK(evmask); + + return (0); +} + +#endif + +#if defined(__i386__) || defined(__amd64__) +static int +tsc_allocate_pmc(enum pmc_event pe, char *ctrspec, + struct pmc_op_pmcallocate *pmc_config) +{ + if (pe != PMC_EV_TSC_TSC) + return (-1); + + /* TSC events must be unqualified. 
*/ + if (ctrspec && *ctrspec != '\0') + return (-1); + + pmc_config->pm_md.pm_amd.pm_amd_config = 0; + pmc_config->pm_caps |= PMC_CAP_READ; + + return (0); +} +#endif + +#if defined(__XSCALE__) + +static struct pmc_event_alias xscale_aliases[] = { + EV_ALIAS("branches", "BRANCH_RETIRED"), + EV_ALIAS("branch-mispredicts", "BRANCH_MISPRED"), + EV_ALIAS("dc-misses", "DC_MISS"), + EV_ALIAS("ic-misses", "IC_MISS"), + EV_ALIAS("instructions", "INSTR_RETIRED"), + EV_ALIAS(NULL, NULL) +}; +static int +xscale_allocate_pmc(enum pmc_event pe, char *ctrspec __unused, + struct pmc_op_pmcallocate *pmc_config __unused) +{ + switch (pe) { + default: + break; + } + + return (0); +} +#endif + +#if defined(__mips__) + +static struct pmc_event_alias mips24k_aliases[] = { + EV_ALIAS("instructions", "INSTR_EXECUTED"), + EV_ALIAS("branches", "BRANCH_COMPLETED"), + EV_ALIAS("branch-mispredicts", "BRANCH_MISPRED"), + EV_ALIAS(NULL, NULL) +}; + +#define MIPS24K_KW_OS "os" +#define MIPS24K_KW_USR "usr" +#define MIPS24K_KW_ANYTHREAD "anythread" + +static int +mips24k_allocate_pmc(enum pmc_event pe, char *ctrspec __unused, + struct pmc_op_pmcallocate *pmc_config __unused) +{ + char *p; + + (void) pe; + + pmc_config->pm_caps |= (PMC_CAP_READ | PMC_CAP_WRITE); + + while ((p = strsep(&ctrspec, ",")) != NULL) { + if (KWMATCH(p, MIPS24K_KW_OS)) + pmc_config->pm_caps |= PMC_CAP_SYSTEM; + else if (KWMATCH(p, MIPS24K_KW_USR)) + pmc_config->pm_caps |= PMC_CAP_USER; + else if (KWMATCH(p, MIPS24K_KW_ANYTHREAD)) + pmc_config->pm_caps |= (PMC_CAP_USER | PMC_CAP_SYSTEM); + else + return (-1); + } + + return (0); +} +#endif /* __mips__ */ + + +/* + * Match an event name `name' with its canonical form. + * + * Matches are case insensitive and spaces, periods, underscores and + * hyphen characters are considered to match each other. + * + * Returns 1 for a match, 0 otherwise. 
+ */ + +static int +pmc_match_event_name(const char *name, const char *canonicalname) +{ + int cc, nc; + const unsigned char *c, *n; + + c = (const unsigned char *) canonicalname; + n = (const unsigned char *) name; + + for (; (nc = *n) && (cc = *c); n++, c++) { + + if ((nc == ' ' || nc == '_' || nc == '-' || nc == '.') && + (cc == ' ' || cc == '_' || cc == '-' || cc == '.')) + continue; + + if (toupper(nc) == toupper(cc)) + continue; + + + return (0); + } + + if (*n == '\0' && *c == '\0') + return (1); + + return (0); +} + +/* + * Match an event name against all the event named supported by a + * PMC class. + * + * Returns an event descriptor pointer on match or NULL otherwise. + */ +static const struct pmc_event_descr * +pmc_match_event_class(const char *name, + const struct pmc_class_descr *pcd) +{ + size_t n; + const struct pmc_event_descr *ev; + + ev = pcd->pm_evc_event_table; + for (n = 0; n < pcd->pm_evc_event_table_size; n++, ev++) + if (pmc_match_event_name(name, ev->pm_ev_name)) + return (ev); + + return (NULL); +} + +static int +pmc_mdep_is_compatible_class(enum pmc_class pc) +{ + size_t n; + + for (n = 0; n < pmc_mdep_class_list_size; n++) + if (pmc_mdep_class_list[n] == pc) + return (1); + return (0); +} + +/* + * API entry points + */ + +int +pmc_allocate(const char *ctrspec, enum pmc_mode mode, + uint32_t flags, int cpu, pmc_id_t *pmcid) +{ + size_t n; + int retval; + char *r, *spec_copy; + const char *ctrname; + const struct pmc_event_descr *ev; + const struct pmc_event_alias *alias; + struct pmc_op_pmcallocate pmc_config; + const struct pmc_class_descr *pcd; + + spec_copy = NULL; + retval = -1; + + if (mode != PMC_MODE_SS && mode != PMC_MODE_TS && + mode != PMC_MODE_SC && mode != PMC_MODE_TC) { + errno = EINVAL; + goto out; + } + + /* replace an event alias with the canonical event specifier */ + if (pmc_mdep_event_aliases) + for (alias = pmc_mdep_event_aliases; alias->pm_alias; alias++) + if (!strcasecmp(ctrspec, alias->pm_alias)) { + spec_copy = 
strdup(alias->pm_spec); + break; + } + + if (spec_copy == NULL) + spec_copy = strdup(ctrspec); + + r = spec_copy; + ctrname = strsep(&r, ","); + + /* + * If an explicit class prefix was given by the user, restrict the + * search for the event to the specified PMC class. + */ + ev = NULL; + for (n = 0; n < PMC_CLASS_TABLE_SIZE; n++) { + pcd = pmc_class_table[n]; + if (pmc_mdep_is_compatible_class(pcd->pm_evc_class) && + strncasecmp(ctrname, pcd->pm_evc_name, + pcd->pm_evc_name_size) == 0) { + if ((ev = pmc_match_event_class(ctrname + + pcd->pm_evc_name_size, pcd)) == NULL) { + errno = EINVAL; + goto out; + } + break; + } + } + + /* + * Otherwise, search for this event in all compatible PMC + * classes. + */ + for (n = 0; ev == NULL && n < PMC_CLASS_TABLE_SIZE; n++) { + pcd = pmc_class_table[n]; + if (pmc_mdep_is_compatible_class(pcd->pm_evc_class)) + ev = pmc_match_event_class(ctrname, pcd); + } + + if (ev == NULL) { + errno = EINVAL; + goto out; + } + + bzero(&pmc_config, sizeof(pmc_config)); + pmc_config.pm_ev = ev->pm_ev_code; + pmc_config.pm_class = pcd->pm_evc_class; + pmc_config.pm_cpu = cpu; + pmc_config.pm_mode = mode; + pmc_config.pm_flags = flags; + + if (PMC_IS_SAMPLING_MODE(mode)) + pmc_config.pm_caps |= PMC_CAP_INTERRUPT; + + if (pcd->pm_evc_allocate_pmc(ev->pm_ev_code, r, &pmc_config) < 0) { + errno = EINVAL; + goto out; + } + + if (PMC_CALL(PMCALLOCATE, &pmc_config) < 0) + goto out; + + *pmcid = pmc_config.pm_pmcid; + + retval = 0; + + out: + if (spec_copy) + free(spec_copy); + + return (retval); +} +/* + * Attach PMC 'pmc' to process 'pid' by issuing the PMCATTACH operation. + * Returns whatever that call returns. + */ +int +pmc_attach(pmc_id_t pmc, pid_t pid) +{ + struct pmc_op_pmcattach pmc_attach_args; + + pmc_attach_args.pm_pmc = pmc; + pmc_attach_args.pm_pid = pid; + + return (PMC_CALL(PMCATTACH, &pmc_attach_args)); +} +/* + * Look up the class encoded in 'pmcid' among the classes cached in + * cpu_info and store that class's capability mask in '*caps'. + * Returns 0 on success, or -1 with errno set to EINVAL if no cached + * class matches. + */ +int +pmc_capabilities(pmc_id_t pmcid, uint32_t *caps) +{ + unsigned int i; + enum pmc_class cl; + + cl = PMC_ID_TO_CLASS(pmcid); + for (i = 0; i < cpu_info.pm_nclass; i++) + if (cpu_info.pm_classes[i].pm_class == cl) { + *caps = 
cpu_info.pm_classes[i].pm_caps; + return (0); + } + errno = EINVAL; + return (-1); +} +/* + * Hand file descriptor 'fd' to the CONFIGURELOG operation. + * Returns 0 on success and -1 if the call fails. + */ +int +pmc_configure_logfile(int fd) +{ + struct pmc_op_configurelog cla; + + cla.pm_logfd = fd; + if (PMC_CALL(CONFIGURELOG, &cla) < 0) + return (-1); + return (0); +} +/* + * Point '*pci' at the library's cached cpu_info. Fails with errno set + * to ENXIO while pmc_syscall is still -1, i.e., before a successful + * pmc_init(). + */ +int +pmc_cpuinfo(const struct pmc_cpuinfo **pci) +{ + if (pmc_syscall == -1) { + errno = ENXIO; + return (-1); + } + + *pci = &cpu_info; + return (0); +} +/* + * Detach PMC 'pmc' from process 'pid' by issuing the PMCDETACH + * operation. Returns whatever that call returns. + */ +int +pmc_detach(pmc_id_t pmc, pid_t pid) +{ + struct pmc_op_pmcattach pmc_detach_args; + + pmc_detach_args.pm_pmc = pmc; + pmc_detach_args.pm_pid = pid; + return (PMC_CALL(PMCDETACH, &pmc_detach_args)); +} +/* + * Request state PMC_STATE_DISABLED for PMC number 'pmc' on CPU 'cpu' + * through the PMCADMIN operation. + */ +int +pmc_disable(int cpu, int pmc) +{ + struct pmc_op_pmcadmin ssa; + + ssa.pm_cpu = cpu; + ssa.pm_pmc = pmc; + ssa.pm_state = PMC_STATE_DISABLED; + return (PMC_CALL(PMCADMIN, &ssa)); +} +/* + * Request state PMC_STATE_FREE for PMC number 'pmc' on CPU 'cpu' + * through the PMCADMIN operation; the counterpart of pmc_disable(). + */ +int +pmc_enable(int cpu, int pmc) +{ + struct pmc_op_pmcadmin ssa; + + ssa.pm_cpu = cpu; + ssa.pm_pmc = pmc; + ssa.pm_state = PMC_STATE_FREE; + return (PMC_CALL(PMCADMIN, &ssa)); +} + +/* + * Return a list of events known to a given PMC class. 'cl' is the + * PMC class identifier, 'eventnames' is the returned list of 'const + * char *' pointers pointing to the names of the events. 'nevents' is + * the number of event name pointers returned. + * + * The space for 'eventnames' is allocated using malloc(3). The caller + * is responsible for freeing this space when done. + * + * Returns 0 on success. Returns -1 with errno set to EINVAL if 'cl' + * is not one of the classes handled below, and -1 if the allocation + * fails. + */ +int +pmc_event_names_of_class(enum pmc_class cl, const char ***eventnames, + int *nevents) +{ + int count; + const char **names; + const struct pmc_event_descr *ev; + + switch (cl) + { + case PMC_CLASS_IAF: + ev = iaf_event_table; + count = PMC_EVENT_TABLE_SIZE(iaf); + break; + case PMC_CLASS_IAP: + /* + * Return the most appropriate set of event name + * spellings for the current CPU. 
+ */ + switch (cpu_info.pm_cputype) { + default: + case PMC_CPU_INTEL_ATOM: + ev = atom_event_table; + count = PMC_EVENT_TABLE_SIZE(atom); + break; + case PMC_CPU_INTEL_CORE: + ev = core_event_table; + count = PMC_EVENT_TABLE_SIZE(core); + break; + case PMC_CPU_INTEL_CORE2: + case PMC_CPU_INTEL_CORE2EXTREME: + ev = core2_event_table; + count = PMC_EVENT_TABLE_SIZE(core2); + break; + case PMC_CPU_INTEL_COREI7: + ev = corei7_event_table; + count = PMC_EVENT_TABLE_SIZE(corei7); + break; + case PMC_CPU_INTEL_WESTMERE: + ev = westmere_event_table; + count = PMC_EVENT_TABLE_SIZE(westmere); + break; + } + break; + case PMC_CLASS_UCF: + ev = ucf_event_table; + count = PMC_EVENT_TABLE_SIZE(ucf); + break; + case PMC_CLASS_UCP: + /* + * Return the most appropriate set of event name + * spellings for the current CPU. + */ + switch (cpu_info.pm_cputype) { + default: + case PMC_CPU_INTEL_COREI7: + ev = corei7uc_event_table; + count = PMC_EVENT_TABLE_SIZE(corei7uc); + break; + case PMC_CPU_INTEL_WESTMERE: + ev = westmereuc_event_table; + count = PMC_EVENT_TABLE_SIZE(westmereuc); + break; + } + break; + case PMC_CLASS_TSC: + ev = tsc_event_table; + count = PMC_EVENT_TABLE_SIZE(tsc); + break; + case PMC_CLASS_K7: + ev = k7_event_table; + count = PMC_EVENT_TABLE_SIZE(k7); + break; + case PMC_CLASS_K8: + ev = k8_event_table; + count = PMC_EVENT_TABLE_SIZE(k8); + break; + case PMC_CLASS_P4: + ev = p4_event_table; + count = PMC_EVENT_TABLE_SIZE(p4); + break; + case PMC_CLASS_P5: + ev = p5_event_table; + count = PMC_EVENT_TABLE_SIZE(p5); + break; + case PMC_CLASS_P6: + ev = p6_event_table; + count = PMC_EVENT_TABLE_SIZE(p6); + break; + case PMC_CLASS_XSCALE: + ev = xscale_event_table; + count = PMC_EVENT_TABLE_SIZE(xscale); + break; + case PMC_CLASS_MIPS24K: + ev = mips24k_event_table; + count = PMC_EVENT_TABLE_SIZE(mips24k); + break; + default: + errno = EINVAL; + return (-1); + } + + if ((names = malloc(count * sizeof(const char *))) == NULL) + return (-1); + + *eventnames = names; + 
*nevents = count; + + for (;count--; ev++, names++) + *names = ev->pm_ev_name; + return (0); +} + +int +pmc_flush_logfile(void) +{ + return (PMC_CALL(FLUSHLOG,0)); +} + +int +pmc_get_driver_stats(struct pmc_driverstats *ds) +{ + struct pmc_op_getdriverstats gms; + + if (PMC_CALL(GETDRIVERSTATS, &gms) < 0) + return (-1); + + /* copy out fields in the current userland<->library interface */ + ds->pm_intr_ignored = gms.pm_intr_ignored; + ds->pm_intr_processed = gms.pm_intr_processed; + ds->pm_intr_bufferfull = gms.pm_intr_bufferfull; + ds->pm_syscalls = gms.pm_syscalls; + ds->pm_syscall_errors = gms.pm_syscall_errors; + ds->pm_buffer_requests = gms.pm_buffer_requests; + ds->pm_buffer_requests_failed = gms.pm_buffer_requests_failed; + ds->pm_log_sweeps = gms.pm_log_sweeps; + return (0); +} + +int +pmc_get_msr(pmc_id_t pmc, uint32_t *msr) +{ + struct pmc_op_getmsr gm; + + gm.pm_pmcid = pmc; + if (PMC_CALL(PMCGETMSR, &gm) < 0) + return (-1); + *msr = gm.pm_msr; + return (0); +} + +int +pmc_init(void) +{ + int error, pmc_mod_id; + unsigned int n; + uint32_t abi_version; + struct module_stat pmc_modstat; + struct pmc_op_getcpuinfo op_cpu_info; +#if defined(__amd64__) || defined(__i386__) + int cpu_has_iaf_counters; + unsigned int t; +#endif + + if (pmc_syscall != -1) /* already inited */ + return (0); + + /* retrieve the system call number from the KLD */ + if ((pmc_mod_id = modfind(PMC_MODULE_NAME)) < 0) + return (-1); + + pmc_modstat.version = sizeof(struct module_stat); + if ((error = modstat(pmc_mod_id, &pmc_modstat)) < 0) + return (-1); + + pmc_syscall = pmc_modstat.data.intval; + + /* check the kernel module's ABI against our compiled-in version */ + abi_version = PMC_VERSION; + if (PMC_CALL(GETMODULEVERSION, &abi_version) < 0) + return (pmc_syscall = -1); + + /* ignore patch & minor numbers for the comparision */ + if ((abi_version & 0xFF000000) != (PMC_VERSION & 0xFF000000)) { + errno = EPROGMISMATCH; + return (pmc_syscall = -1); + } + + if (PMC_CALL(GETCPUINFO, 
&op_cpu_info) < 0) + return (pmc_syscall = -1); + + cpu_info.pm_cputype = op_cpu_info.pm_cputype; + cpu_info.pm_ncpu = op_cpu_info.pm_ncpu; + cpu_info.pm_npmc = op_cpu_info.pm_npmc; + cpu_info.pm_nclass = op_cpu_info.pm_nclass; + for (n = 0; n < cpu_info.pm_nclass; n++) + cpu_info.pm_classes[n] = op_cpu_info.pm_classes[n]; + + pmc_class_table = malloc(PMC_CLASS_TABLE_SIZE * + sizeof(struct pmc_class_descr *)); + + if (pmc_class_table == NULL) + return (-1); + + for (n = 0; n < PMC_CLASS_TABLE_SIZE; n++) + pmc_class_table[n] = NULL; + + /* + * Fill in the class table. + */ + n = 0; +#if defined(__amd64__) || defined(__i386__) + pmc_class_table[n++] = &tsc_class_table_descr; + + /* + * Check if this CPU has fixed function counters. + */ + cpu_has_iaf_counters = 0; + for (t = 0; t < cpu_info.pm_nclass; t++) + if (cpu_info.pm_classes[t].pm_class == PMC_CLASS_IAF && + cpu_info.pm_classes[t].pm_num > 0) + cpu_has_iaf_counters = 1; +#endif + +#define PMC_MDEP_INIT(C) do { \ + pmc_mdep_event_aliases = C##_aliases; \ + pmc_mdep_class_list = C##_pmc_classes; \ + pmc_mdep_class_list_size = \ + PMC_TABLE_SIZE(C##_pmc_classes); \ + } while (0) + +#define PMC_MDEP_INIT_INTEL_V2(C) do { \ + PMC_MDEP_INIT(C); \ + pmc_class_table[n++] = &iaf_class_table_descr; \ + if (!cpu_has_iaf_counters) \ + pmc_mdep_event_aliases = \ + C##_aliases_without_iaf; \ + pmc_class_table[n] = &C##_class_table_descr; \ + } while (0) + + /* Configure the event name parser. */ + switch (cpu_info.pm_cputype) { +#if defined(__i386__) + case PMC_CPU_AMD_K7: + PMC_MDEP_INIT(k7); + pmc_class_table[n] = &k7_class_table_descr; + break; + case PMC_CPU_INTEL_P5: + PMC_MDEP_INIT(p5); + pmc_class_table[n] = &p5_class_table_descr; + break; + case PMC_CPU_INTEL_P6: /* P6 ... Pentium M CPUs have */ + case PMC_CPU_INTEL_PII: /* similar PMCs. 
*/ + case PMC_CPU_INTEL_PIII: + case PMC_CPU_INTEL_PM: + PMC_MDEP_INIT(p6); + pmc_class_table[n] = &p6_class_table_descr; + break; +#endif +#if defined(__amd64__) || defined(__i386__) + case PMC_CPU_AMD_K8: + PMC_MDEP_INIT(k8); + pmc_class_table[n] = &k8_class_table_descr; + break; + case PMC_CPU_INTEL_ATOM: + PMC_MDEP_INIT_INTEL_V2(atom); + break; + case PMC_CPU_INTEL_CORE: + PMC_MDEP_INIT(core); + pmc_class_table[n] = &core_class_table_descr; + break; + case PMC_CPU_INTEL_CORE2: + case PMC_CPU_INTEL_CORE2EXTREME: + PMC_MDEP_INIT_INTEL_V2(core2); + break; + case PMC_CPU_INTEL_COREI7: + pmc_class_table[n++] = &ucf_class_table_descr; + pmc_class_table[n++] = &corei7uc_class_table_descr; + PMC_MDEP_INIT_INTEL_V2(corei7); + break; + case PMC_CPU_INTEL_WESTMERE: + pmc_class_table[n++] = &ucf_class_table_descr; + pmc_class_table[n++] = &westmereuc_class_table_descr; + PMC_MDEP_INIT_INTEL_V2(westmere); + break; + case PMC_CPU_INTEL_PIV: + PMC_MDEP_INIT(p4); + pmc_class_table[n] = &p4_class_table_descr; + break; +#endif +#if defined(__XSCALE__) + case PMC_CPU_INTEL_XSCALE: + PMC_MDEP_INIT(xscale); + pmc_class_table[n] = &xscale_class_table_descr; + break; +#endif +#if defined(__mips__) + case PMC_CPU_MIPS_24K: + PMC_MDEP_INIT(mips24k); + pmc_class_table[n] = &mips24k_class_table_descr; + break; +#endif /* __mips__ */ + default: + /* + * Some kind of CPU this version of the library knows nothing + * about. This shouldn't happen since the abi version check + * should have caught this. + */ + errno = ENXIO; + return (pmc_syscall = -1); + } + + return (0); +} + +const char * +pmc_name_of_capability(enum pmc_caps cap) +{ + int i; + + /* + * 'cap' should have a single bit set and should be in + * range. 
+ */ + if ((cap & (cap - 1)) || cap < PMC_CAP_FIRST || + cap > PMC_CAP_LAST) { + errno = EINVAL; + return (NULL); + } + + i = ffs(cap); + return (pmc_capability_names[i - 1]); +} +/* + * Map PMC class 'pc' to its entry in pmc_class_names. Returns NULL + * with errno set to EINVAL if 'pc' lies outside the range + * PMC_CLASS_FIRST..PMC_CLASS_LAST. + */ +const char * +pmc_name_of_class(enum pmc_class pc) +{ + if ((int) pc >= PMC_CLASS_FIRST && + pc <= PMC_CLASS_LAST) + return (pmc_class_names[pc]); + + errno = EINVAL; + return (NULL); +} +/* + * Search pmc_cputype_names for CPU type 'cp' and return the matching + * name, or NULL with errno set to EINVAL if the table has no entry. + */ +const char * +pmc_name_of_cputype(enum pmc_cputype cp) +{ + size_t n; + + for (n = 0; n < PMC_TABLE_SIZE(pmc_cputype_names); n++) + if (cp == pmc_cputype_names[n].pm_cputype) + return (pmc_cputype_names[n].pm_name); + + errno = EINVAL; + return (NULL); +} +/* + * Map disposition 'pd' to its entry in pmc_disposition_names. Returns + * NULL with errno set to EINVAL if 'pd' lies outside the range + * PMC_DISP_FIRST..PMC_DISP_LAST. + */ +const char * +pmc_name_of_disposition(enum pmc_disp pd) +{ + if ((int) pd >= PMC_DISP_FIRST && + pd <= PMC_DISP_LAST) + return (pmc_disposition_names[pd]); + + errno = EINVAL; + return (NULL); +} +/* + * Return the name of event 'pe' as spelled for CPU type 'cpu'. The + * numeric range that 'pe' falls in selects the event table to search; + * for the IAP and UCP ranges the table also depends on 'cpu'. Returns + * NULL if no table applies or the code is not in the selected table. + * This function does not set errno. + */ +const char * +_pmc_name_of_event(enum pmc_event pe, enum pmc_cputype cpu) +{ + const struct pmc_event_descr *ev, *evfence; + + ev = evfence = NULL; + if (pe >= PMC_EV_IAF_FIRST && pe <= PMC_EV_IAF_LAST) { + ev = iaf_event_table; + evfence = iaf_event_table + PMC_EVENT_TABLE_SIZE(iaf); + } else if (pe >= PMC_EV_IAP_FIRST && pe <= PMC_EV_IAP_LAST) { + switch (cpu) { + case PMC_CPU_INTEL_ATOM: + ev = atom_event_table; + evfence = atom_event_table + PMC_EVENT_TABLE_SIZE(atom); + break; + case PMC_CPU_INTEL_CORE: + ev = core_event_table; + evfence = core_event_table + PMC_EVENT_TABLE_SIZE(core); + break; + case PMC_CPU_INTEL_CORE2: + case PMC_CPU_INTEL_CORE2EXTREME: + ev = core2_event_table; + evfence = core2_event_table + PMC_EVENT_TABLE_SIZE(core2); + break; + case PMC_CPU_INTEL_COREI7: + ev = corei7_event_table; + evfence = corei7_event_table + PMC_EVENT_TABLE_SIZE(corei7); + break; + case PMC_CPU_INTEL_WESTMERE: + ev = westmere_event_table; + evfence = westmere_event_table + PMC_EVENT_TABLE_SIZE(westmere); + break; + default: /* Unknown CPU type. 
*/ + break; + } + } else if (pe >= PMC_EV_UCF_FIRST && pe <= PMC_EV_UCF_LAST) { + ev = ucf_event_table; + evfence = ucf_event_table + PMC_EVENT_TABLE_SIZE(ucf); + } else if (pe >= PMC_EV_UCP_FIRST && pe <= PMC_EV_UCP_LAST) { + switch (cpu) { + case PMC_CPU_INTEL_COREI7: + ev = corei7uc_event_table; + evfence = corei7uc_event_table + PMC_EVENT_TABLE_SIZE(corei7uc); + break; + case PMC_CPU_INTEL_WESTMERE: + ev = westmereuc_event_table; + evfence = westmereuc_event_table + PMC_EVENT_TABLE_SIZE(westmereuc); + break; + default: /* Unknown CPU type. */ + break; + } + } else if (pe >= PMC_EV_K7_FIRST && pe <= PMC_EV_K7_LAST) { + ev = k7_event_table; + evfence = k7_event_table + PMC_EVENT_TABLE_SIZE(k7); + } else if (pe >= PMC_EV_K8_FIRST && pe <= PMC_EV_K8_LAST) { + ev = k8_event_table; + evfence = k8_event_table + PMC_EVENT_TABLE_SIZE(k8); + } else if (pe >= PMC_EV_P4_FIRST && pe <= PMC_EV_P4_LAST) { + ev = p4_event_table; + evfence = p4_event_table + PMC_EVENT_TABLE_SIZE(p4); + } else if (pe >= PMC_EV_P5_FIRST && pe <= PMC_EV_P5_LAST) { + ev = p5_event_table; + evfence = p5_event_table + PMC_EVENT_TABLE_SIZE(p5); + } else if (pe >= PMC_EV_P6_FIRST && pe <= PMC_EV_P6_LAST) { + ev = p6_event_table; + evfence = p6_event_table + PMC_EVENT_TABLE_SIZE(p6); + } else if (pe >= PMC_EV_XSCALE_FIRST && pe <= PMC_EV_XSCALE_LAST) { + ev = xscale_event_table; + evfence = xscale_event_table + PMC_EVENT_TABLE_SIZE(xscale); + } else if (pe >= PMC_EV_MIPS24K_FIRST && pe <= PMC_EV_MIPS24K_LAST) { + ev = mips24k_event_table; + evfence = mips24k_event_table + PMC_EVENT_TABLE_SIZE(mips24k +); + } else if (pe == PMC_EV_TSC_TSC) { + ev = tsc_event_table; + evfence = tsc_event_table + PMC_EVENT_TABLE_SIZE(tsc); + } + /* If no table was selected, ev == evfence == NULL and the loop is skipped. */ + for (; ev != evfence; ev++) + if (pe == ev->pm_ev_code) + return (ev->pm_ev_name); + + return (NULL); +} +/* + * Return the name of event 'pe' for the CPU type recorded in cpu_info, + * or NULL with errno set to EINVAL if the lookup fails. + */ +const char * +pmc_name_of_event(enum pmc_event pe) +{ + const char *n; + + if ((n = _pmc_name_of_event(pe, cpu_info.pm_cputype)) != NULL) + return (n); + + errno = EINVAL; 
+ return (NULL); +} +/* + * Map mode 'pm' to its entry in pmc_mode_names. Returns NULL with + * errno set to EINVAL if 'pm' lies outside PMC_MODE_FIRST..PMC_MODE_LAST. + */ +const char * +pmc_name_of_mode(enum pmc_mode pm) +{ + if ((int) pm >= PMC_MODE_FIRST && + pm <= PMC_MODE_LAST) + return (pmc_mode_names[pm]); + + errno = EINVAL; + return (NULL); +} +/* + * Map state 'ps' to its entry in pmc_state_names. Returns NULL with + * errno set to EINVAL if 'ps' lies outside + * PMC_STATE_FIRST..PMC_STATE_LAST. + */ +const char * +pmc_name_of_state(enum pmc_state ps) +{ + if ((int) ps >= PMC_STATE_FIRST && + ps <= PMC_STATE_LAST) + return (pmc_state_names[ps]); + + errno = EINVAL; + return (NULL); +} +/* + * Return the pm_ncpu value cached in cpu_info, or -1 with errno set to + * ENXIO while pmc_syscall is still -1. + */ +int +pmc_ncpu(void) +{ + if (pmc_syscall == -1) { + errno = ENXIO; + return (-1); + } + + return (cpu_info.pm_ncpu); +} +/* + * Return the pm_npmc value cached in cpu_info after checking that + * 'cpu' lies in [0, pm_ncpu); the same count is returned for every + * valid CPU. Fails with ENXIO while pmc_syscall is still -1 and with + * EINVAL for an out-of-range 'cpu'. + */ +int +pmc_npmc(int cpu) +{ + if (pmc_syscall == -1) { + errno = ENXIO; + return (-1); + } + + if (cpu < 0 || cpu >= (int) cpu_info.pm_ncpu) { + errno = EINVAL; + return (-1); + } + + return (cpu_info.pm_npmc); +} +/* + * Allocate a zeroed buffer holding a struct pmc_op_getpmcinfo followed + * by pmc_npmc(cpu) struct pmc_info records, fill it using the + * GETPMCINFO operation and return it through '*ppmci'. The buffer + * comes from calloc(3) and is freed here only on failure, so on + * success the caller presumably owns it -- confirm against the manual + * page. + */ +int +pmc_pmcinfo(int cpu, struct pmc_pmcinfo **ppmci) +{ + int nbytes, npmc; + struct pmc_op_getpmcinfo *pmci; + + if ((npmc = pmc_npmc(cpu)) < 0) + return (-1); + + nbytes = sizeof(struct pmc_op_getpmcinfo) + + npmc * sizeof(struct pmc_info); + + if ((pmci = calloc(1, nbytes)) == NULL) + return (-1); + + pmci->pm_cpu = cpu; + + if (PMC_CALL(GETPMCINFO, pmci) < 0) { + free(pmci); + return (-1); + } + + /* kernel<->library, library<->userland interfaces are identical */ + *ppmci = (struct pmc_pmcinfo *) pmci; + return (0); +} +/* + * Issue a PMCRW operation with PMC_F_OLDVALUE for PMC 'pmc' and store + * the value it reports in '*value'. Returns 0 on success and -1 if + * the call fails. + */ +int +pmc_read(pmc_id_t pmc, pmc_value_t *value) +{ + struct pmc_op_pmcrw pmc_read_op; + + pmc_read_op.pm_pmcid = pmc; + pmc_read_op.pm_flags = PMC_F_OLDVALUE; + pmc_read_op.pm_value = -1; + + if (PMC_CALL(PMCRW, &pmc_read_op) < 0) + return (-1); + + *value = pmc_read_op.pm_value; + return (0); +} +/* + * Issue the PMCRELEASE operation for PMC 'pmc' and return its result. + */ +int +pmc_release(pmc_id_t pmc) +{ + struct pmc_op_simple pmc_release_args; + + pmc_release_args.pm_pmcid = pmc; + return (PMC_CALL(PMCRELEASE, &pmc_release_args)); +} +/* + * Issue one PMCRW operation carrying both PMC_F_NEWVALUE and + * PMC_F_OLDVALUE: 'newvalue' is passed in and the value the call + * leaves in pm_value is stored in '*oldvaluep'. + */ +int +pmc_rw(pmc_id_t pmc, pmc_value_t newvalue, pmc_value_t *oldvaluep) +{ + struct pmc_op_pmcrw pmc_rw_op; + + pmc_rw_op.pm_pmcid = pmc; + pmc_rw_op.pm_flags = PMC_F_NEWVALUE | PMC_F_OLDVALUE; + pmc_rw_op.pm_value = newvalue; + + if (PMC_CALL(PMCRW, 
&pmc_rw_op) < 0) + return (-1); + + *oldvaluep = pmc_rw_op.pm_value; + return (0); +} +/* + * Pass 'value' as pm_count to the PMCSETCOUNT operation for PMC 'pmc'. + * Returns 0 on success and -1 if the call fails. + */ +int +pmc_set(pmc_id_t pmc, pmc_value_t value) +{ + struct pmc_op_pmcsetcount sc; + + sc.pm_pmcid = pmc; + sc.pm_count = value; + + if (PMC_CALL(PMCSETCOUNT, &sc) < 0) + return (-1); + return (0); +} +/* + * Issue the PMCSTART operation for PMC 'pmc' and return its result. + */ +int +pmc_start(pmc_id_t pmc) +{ + struct pmc_op_simple pmc_start_args; + + pmc_start_args.pm_pmcid = pmc; + return (PMC_CALL(PMCSTART, &pmc_start_args)); +} +/* + * Issue the PMCSTOP operation for PMC 'pmc' and return its result. + */ +int +pmc_stop(pmc_id_t pmc) +{ + struct pmc_op_simple pmc_stop_args; + + pmc_stop_args.pm_pmcid = pmc; + return (PMC_CALL(PMCSTOP, &pmc_stop_args)); +} +/* + * Look up the class encoded in 'pmcid' among the classes cached in + * cpu_info and store that class's pm_width in '*width'. Returns 0 on + * success, or -1 with errno set to EINVAL if no cached class matches. + */ +int +pmc_width(pmc_id_t pmcid, uint32_t *width) +{ + unsigned int i; + enum pmc_class cl; + + cl = PMC_ID_TO_CLASS(pmcid); + for (i = 0; i < cpu_info.pm_nclass; i++) + if (cpu_info.pm_classes[i].pm_class == cl) { + *width = cpu_info.pm_classes[i].pm_width; + return (0); + } + errno = EINVAL; + return (-1); +} +/* + * Issue a PMCRW operation with only PMC_F_NEWVALUE set, passing + * 'value' for PMC 'pmc'; returns the result of that call. + */ +int +pmc_write(pmc_id_t pmc, pmc_value_t value) +{ + struct pmc_op_pmcrw pmc_write_op; + + pmc_write_op.pm_pmcid = pmc; + pmc_write_op.pm_flags = PMC_F_NEWVALUE; + pmc_write_op.pm_value = value; + return (PMC_CALL(PMCRW, &pmc_write_op)); +} +/* + * Pass 'userdata' to the WRITELOG operation and return its result. + */ +int +pmc_writelog(uint32_t userdata) +{ + struct pmc_op_writelog wl; + + wl.pm_userdata = userdata; + return (PMC_CALL(WRITELOG, &wl)); +} diff --git a/lib/libpmc/libpmcinternal.h b/lib/libpmc/libpmcinternal.h new file mode 100644 index 0000000..b1c9c86 --- /dev/null +++ b/lib/libpmc/libpmcinternal.h @@ -0,0 +1,37 @@ +/*- + * Copyright (c) 2008 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef LIBPMC_INTERNAL_H +#define LIBPMC_INTERNAL_H 1 + +/* + * Prototypes. + */ +const char *_pmc_name_of_event(enum pmc_event _ev, enum pmc_cputype _cpu); + +#endif /* LIBPMC_INTERNAL_H */ diff --git a/lib/libpmc/pmc.3 b/lib/libpmc/pmc.3 new file mode 100644 index 0000000..2403e64 --- /dev/null +++ b/lib/libpmc/pmc.3 @@ -0,0 +1,540 @@ +.\" Copyright (c) 2003-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 24, 2008 +.Dt PMC 3 +.Os +.Sh NAME +.Nm pmc +.Nd library for accessing hardware performance monitoring counters +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +The +.Lb libpmc +provides a programming interface that allows applications to use +hardware performance counters to gather performance data about +specific processes or for the system as a whole. +The library is implemented using the lower-level facilities offered by +the +.Xr hwpmc 4 +driver. +.Ss Key Concepts +Performance monitoring counters (PMCs) are represented by the library +using a software abstraction. +These +.Dq abstract +PMCs can have two scopes: +.Bl -bullet +.It +System scope. +These PMCs measure events in a whole-system manner, i.e., independent +of the currently executing thread. +System scope PMCs are allocated on specific CPUs and do not +migrate between CPUs. +Non-privileged process are allowed to allocate system scope PMCs if the +.Xr hwpmc 4 +sysctl tunable: +.Va security.bsd.unprivileged_syspmcs +is non-zero. +.It +Process scope. +These PMCs only measure hardware events when the processes they are +attached to are executing on a CPU. 
+In an SMP system, process scope PMCs migrate between CPUs along with +their target processes. +.El +.Pp +Orthogonal to PMC scope, PMCs may be allocated in one of two +operational modes: +.Bl -bullet +.It +Counting PMCs measure events according to their scope +(system or process). +The application needs to explicitly read these counters +to retrieve their value. +.It +Sampling PMCs cause the CPU to be periodically interrupted +and information about its state of execution to be collected. +Sampling PMCs are used to profile specific processes and kernel +threads or to profile the system as a whole. +.El +.Pp +The scope and operational mode for a software PMC are specified at +PMC allocation time. +An application is allowed to allocate multiple PMCs subject +to availability of hardware resources. +.Pp +The library uses human-readable strings to name the event being +measured by hardware. +The syntax used for specifying a hardware event along with additional +event specific qualifiers (if any) is described in detail in section +.Sx "EVENT SPECIFIERS" +below. +.Pp +PMCs are associated with the process that allocated them and +will be automatically reclaimed by the system when the process exits. +Additionally, process-scope PMCs have to be attached to one or more +target processes before they can perform measurements. +A process-scope PMC may be attached to those target processes +that its owner process would otherwise be permitted to debug. +An owner process may attach PMCs to itself allowing +it to measure its own behavior. +Additionally, on some machine architectures, such self-attached PMCs +may be read cheaply using specialized instructions supported by the +processor. +.Pp +Certain kinds of PMCs require that a log file be configured before +they may be started. +These include: +.Bl -bullet -compact +.It +System scope sampling PMCs. +.It +Process scope sampling PMCs. 
+.It +Process scope counting PMCs that have been configured to report PMC +readings on process context switches or process exits. +.El +Up to one log file may be configured per owner process. +Events logged to a log file may be subsequently analyzed using the +.Xr pmclog 3 +family of functions. +.Ss Supported CPUs +The CPUs known to the PMC library are named by the +.Vt "enum pmc_cputype" +enumeration. +Supported CPUs include: +.Bl -tag -width "Li PMC_CPU_INTEL_CORE2" -compact +.It Li PMC_CPU_AMD_K7 +.Tn "AMD Athlon" +CPUs. +.It Li PMC_CPU_AMD_K8 +.Tn "AMD Athlon64" +CPUs. +.It Li PMC_CPU_INTEL_ATOM +.Tn Intel +.Tn Atom +CPUs and other CPUs conforming to version 3 of the +.Tn Intel +performance measurement architecture. +.It Li PMC_CPU_INTEL_CORE +.Tn Intel +.Tn Core Solo +and +.Tn Core Duo +CPUs, and other CPUs conforming to version 1 of the +.Tn Intel +performance measurement architecture. +.It Li PMC_CPU_INTEL_CORE2 +.Tn Intel +.Tn "Core2 Solo" , +.Tn "Core2 Duo" +and +.Tn "Core2 Extreme" +CPUs, and other CPUs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +.It Li PMC_CPU_INTEL_P5 +.Tn Intel +.Tn "Pentium" +CPUs. +.It Li PMC_CPU_INTEL_P6 +.Tn Intel +.Tn "Pentium Pro" +CPUs. +.It Li PMC_CPU_INTEL_PII +.Tn "Intel Pentium II" +CPUs. +.It Li PMC_CPU_INTEL_PIII +.Tn "Intel Pentium III" +CPUs. +.It Li PMC_CPU_INTEL_PIV +.Tn "Intel Pentium 4" +CPUs. +.It Li PMC_CPU_INTEL_PM +.Tn "Intel Pentium M" +CPUs. +.El +.Ss Supported PMCs +PMCs supported by this library are named by the +.Vt enum pmc_class +enumeration. +Supported PMC kinds include: +.Bl -tag -width "Li PMC_CLASS_IAF" -compact +.It Li PMC_CLASS_IAF +Fixed function hardware counters present in CPUs conforming to the +.Tn Intel +performance measurement architecture version 2 and later. +.It Li PMC_CLASS_IAP +Programmable hardware counters present in CPUs conforming to the +.Tn Intel +performance measurement architecture version 1 and later. 
+.It Li PMC_CLASS_K7 +Programmable hardware counters present in +.Tn "AMD Athlon" +CPUs. +.It Li PMC_CLASS_K8 +Programmable hardware counters present in +.Tn "AMD Athlon64" +CPUs. +.It Li PMC_CLASS_P4 +Programmable hardware counters present in +.Tn "Intel Pentium 4" +CPUs. +.It Li PMC_CLASS_P5 +Programmable hardware counters present in +.Tn Intel +.Tn Pentium +CPUs. +.It Li PMC_CLASS_P6 +Programmable hardware counters present in +.Tn Intel +.Tn "Pentium Pro" , +.Tn "Pentium II" , +.Tn "Pentium III" , +.Tn "Celeron" , +and +.Tn "Pentium M" +CPUs. +.It Li PMC_CLASS_TSC +The timestamp counter on i386 and amd64 architecture CPUs. +.El +.Ss PMC Capabilities +.Pp +Capabilities of performance monitoring hardware are denoted using +the +.Vt "enum pmc_caps" +enumeration. +Supported capabilities include: +.Bl -tag -width "Li PMC_CAP_INTERRUPT" -compact +.It Li PMC_CAP_CASCADE +The ability to cascade counters. +.It Li PMC_CAP_EDGE +The ability to count negated to asserted transitions of the hardware +conditions being probed for. +.It Li PMC_CAP_INTERRUPT +The ability to interrupt the CPU. +.It Li PMC_CAP_INVERT +The ability to invert the sense of the hardware conditions being +measured. +.It Li PMC_CAP_PRECISE +The ability to perform precise sampling. +.It Li PMC_CAP_QUALIFIER +The hardware allows monitored events to be further qualified in some +system dependent way. +.It Li PMC_CAP_READ +The ability to read from performance counters. +.It Li PMC_CAP_SYSTEM +The ability to restrict counting of hardware events to when the CPU is +running privileged code. +.It Li PMC_CAP_THRESHOLD +The ability to ignore simultaneous hardware events below a +programmable threshold. +.It Li PMC_CAP_USER +The ability to restrict counting of hardware events to those when the +CPU is running unprivileged code. +.It Li PMC_CAP_WRITE +The ability to write to performance counters. 
+.El +.Ss CPU Naming Conventions +CPUs are named using small integers from zero up to, but +excluding, the value returned by function +.Fn pmc_ncpu . +On platforms supporting sparsely numbered CPUs not all the numbers in +this range will denote valid CPUs. +Operations on non-existent CPUs will return an error. +.Ss Functional Grouping of the API +This section contains a brief overview of the available functionality +in the PMC library. +Each function listed here is described further in its own manual page. +.Bl -tag -width indent +.It Administration +.Bl -tag -compact +.It Fn pmc_disable , Fn pmc_enable +Administratively disable (enable) specific performance monitoring +counter hardware. +Counters that are disabled will not be available to applications to +use. +.El +.It "Convenience Functions" +.Bl -tag -compact +.It Fn pmc_event_names_of_class +Returns a list of event names supported by a given PMC type. +.It Fn pmc_name_of_capability +Convert a +.Dv PMC_CAP_* +flag to a human-readable string. +.It Fn pmc_name_of_class +Convert a +.Dv PMC_CLASS_* +constant to a human-readable string. +.It Fn pmc_name_of_cputype +Return a human-readable name for a CPU type. +.It Fn pmc_name_of_disposition +Return a human-readable string describing a PMC's disposition. +.It Fn pmc_name_of_event +Convert a numeric event code to a human-readable string. +.It Fn pmc_name_of_mode +Convert a +.Dv PMC_MODE_* +constant to a human-readable name. +.It Fn pmc_name_of_state +Return a human-readable string describing a PMC's current state. +.El +.It "Library Initialization" +.Bl -tag -compact +.It Fn pmc_init +Initialize the library. +This function must be called before any other library function. +.El +.It "Log File Handling" +.Bl -tag -compact +.It Fn pmc_configure_logfile +Configure a log file for +.Xr hwpmc 4 +to write logged events to. +.It Fn pmc_flush_logfile +Flush all pending log data in +.Xr hwpmc 4 Ns Ap s +buffers. 
+.It Fn pmc_writelog +Append arbitrary user data to the current log file. +.El +.It "PMC Management" +.Bl -tag -compact +.It Fn pmc_allocate , Fn pmc_release +Allocate (free) a PMC. +.It Fn pmc_attach , Fn pmc_detach +Attach (detach) a process scope PMC to a target. +.It Fn pmc_read , Fn pmc_write , Fn pmc_rw +Read (write) a value from (to) a PMC. +.It Fn pmc_start , Fn pmc_stop +Start (stop) a software PMC. +.It Fn pmc_set +Set the reload value for a sampling PMC. +.El +.It "Queries" +.Bl -tag -compact +.It Fn pmc_capabilities +Retrieve the capabilities for a given PMC. +.It Fn pmc_cpuinfo +Retrieve information about the CPUs and PMC hardware present in the +system. +.It Fn pmc_get_driver_stats +Retrieve statistics maintained by +.Xr hwpmc 4 . +.It Fn pmc_ncpu +Determine the exclusive upper bound on CPU numbers for the system. +.It Fn pmc_npmc +Return the number of hardware PMCs present in a given CPU. +.It Fn pmc_pmcinfo +Return information about the state of a given CPU's PMCs. +.It Fn pmc_width +Determine the width of a hardware counter in bits. +.El +.It "x86 Architecture Specific API" +.Bl -tag -compact +.It Fn pmc_get_msr +Returns the processor model specific register number +associated with +.Fa pmc . +Applications may then use the x86 +.Ic RDPMC +instruction to directly read the contents of the PMC. +.El +.El +.Ss Signal Handling Requirements +Applications using PMCs are required to handle the following signals: +.Bl -tag -width ".Dv SIGBUS" +.It Dv SIGBUS +When the +.Xr hwpmc 4 +module is unloaded using +.Xr kldunload 8 , +processes that have PMCs allocated to them will be sent a +.Dv SIGBUS +signal. +.It Dv SIGIO +The +.Xr hwpmc 4 +driver will send a PMC owning process a +.Dv SIGIO +signal if: +.Bl -bullet +.It +Any process-mode PMC allocated by it loses all its +target processes. +.It +The driver encounters an error when writing log data to a +configured log file. +This error may be retrieved by a subsequent call to +.Fn pmc_flush_logfile . 
+.El +.El +.Ss Typical Program Flow +.Bl -enum +.It +An application would first invoke function +.Fn pmc_init +to allow the library to initialize itself. +.It +Signal handling would then be set up. +.It +Next the application would allocate the PMCs it desires using function +.Fn pmc_allocate . +.It +Initial values for PMCs may be set using function +.Fn pmc_set . +.It +If a log file is necessary for the PMCs to work, it would +be configured using function +.Fn pmc_configure_logfile . +.It +Process scope PMCs would then be attached to their target processes +using function +.Fn pmc_attach . +.It +The PMCs would then be started using function +.Fn pmc_start . +.It +Once started, the values of counting PMCs may be read using function +.Fn pmc_read . +For PMCs that write events to the log file, this logged data would be +read and parsed using the +.Xr pmclog 3 +family of functions. +.It +PMCs are stopped using function +.Fn pmc_stop , +and process scope PMCs are detached from their targets using +function +.Fn pmc_detach . +.It +Before the process exits, it may release its PMCs using function +.Fn pmc_release . +Any configured log file may be closed using function +.Fn pmc_configure_logfile . +.El +.Sh EVENT SPECIFIERS +Event specifiers are strings comprising an event name, followed by +optional parameters modifying the semantics of the hardware event +being probed. +Event names are PMC architecture dependent, but the PMC library defines +machine independent aliases for commonly used events. +.Pp +Event specifier spellings are case-insensitive and space characters, +periods, underscores and hyphens are considered equivalent to each other. +Thus the event specifiers +.Qq "Example Event" , +.Qq "example-event" , +and +.Qq "EXAMPLE_EVENT" +are equivalent. 
+.Ss PMC Architecture Dependent Events +PMC architecture dependent event specifiers are described in the +following manual pages: +.Bl -column " PMC_CLASS_TSC " "MANUAL PAGE " +.It Em "PMC Class" Ta Em "Manual Page" +.It Li PMC_CLASS_IAF Ta Xr pmc.iaf 3 +.It Li PMC_CLASS_IAP Ta Xr pmc.atom 3 , Xr pmc.core 3 , Xr pmc.core2 3 +.It Li PMC_CLASS_K7 Ta Xr pmc.k7 3 +.It Li PMC_CLASS_K8 Ta Xr pmc.k8 3 +.It Li PMC_CLASS_P4 Ta Xr pmc.p4 3 +.It Li PMC_CLASS_P5 Ta Xr pmc.p5 3 +.It Li PMC_CLASS_P6 Ta Xr pmc.p6 3 +.It Li PMC_CLASS_TSC Ta Xr pmc.tsc 3 +.El +.Ss Event Name Aliases +Event name aliases are PMC-independent names for commonly used events. +The following aliases are known to this version of the +.Nm pmc +library: +.Bl -tag -width indent +.It Li branches +Measure the number of branches retired. +.It Li branch-mispredicts +Measure the number of retired branches that were mispredicted. +.It Li cycles +Measure processor cycles. +This event is implemented using the processor's Time Stamp Counter +register. +.It Li dc-misses +Measure the number of data cache misses. +.It Li ic-misses +Measure the number of instruction cache misses. +.It Li instructions +Measure the number of instructions retired. +.It Li interrupts +Measure the number of interrupts seen. +.It Li unhalted-cycles +Measure the number of cycles the processor is not in a halted +or sleep state. +.El +.Sh COMPATIBILITY +The interface between the +.Nm pmc +library and the +.Xr hwpmc 4 +driver is intended to be private to the implementation and may +change. +In order to ease forward compatibility with future versions of the +.Xr hwpmc 4 +driver, applications are urged to dynamically link with the +.Nm pmc +library. 
+.Pp +The +.Nm pmc +API is +.Ud +.Sh SEE ALSO +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 , +.Xr pmccontrol 8 , +.Xr pmcstat 8 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.atom.3 b/lib/libpmc/pmc.atom.3 new file mode 100644 index 0000000..a54d1db --- /dev/null +++ b/lib/libpmc/pmc.atom.3 @@ -0,0 +1,1193 @@ +.\" Copyright (c) 2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. 
+.\" +.\" $FreeBSD$ +.\" +.Dd November 12, 2008 +.Dt PMC.ATOM 3 +.Os +.Sh NAME +.Nm pmc.atom +.Nd measurement events for +.Tn Intel +.Tn Atom +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn Atom +CPUs contain PMCs conforming to version 3 of the +.Tn Intel +performance measurement architecture. +These CPUs contain two classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_IAP" +.It Li PMC_CLASS_IAF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_IAP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Atom PMCs are documented in +.Rs +.%B "IA-32 Intel(R) Architecture Software Developer's Manual" +.%T "Volume 3: System Programming Guide" +.%N "Order Number 253669-027US" +.%D July 2008 +.%Q "Intel Corporation" +.Re +.Ss ATOM FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.iaf 3 . +.Ss ATOM PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li any +Count matching events seen on any logical processor in a package. +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . 
+.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.It Li os +Configure the PMC to count events happening at processor privilege +level 0. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. +.Pp +Events that require core-specificity to be specified use an +additional qualifier +.Dq Li core= Ns Ar core , +where argument +.Ar core +is one of: +.Bl -tag -width indent +.It Li all +Measure event conditions on all cores. +.It Li this +Measure event conditions on this core. +.El +.Pp +The default is +.Dq Li this . +.Pp +Events that require an agent qualifier to be specified use an +additional qualifier +.Dq Li agent= Ns Ar agent , +where argument +.Ar agent +is one of: +.Bl -tag -width indent +.It Li this +Measure events associated with this bus agent. +.It Li any +Measure events caused by any bus agent. +.El +.Pp +The default is +.Dq Li this . +.Pp +Events that require a hardware prefetch qualifier to be specified use an +additional qualifier +.Dq Li prefetch= Ns Ar prefetch , +where argument +.Ar prefetch +is one of: +.Bl -tag -width "exclude" +.It Li both +Include all prefetches. +.It Li only +Only count hardware prefetches. +.It Li exclude +Exclude hardware prefetches. +.El +.Pp +The default is +.Dq Li both . 
+.Pp +Events that require a cache coherence qualifier to be specified use an +additional qualifier +.Dq Li cachestate= Ns Ar state , +where argument +.Ar state +contains one or more of the following letters: +.Bl -tag -width indent +.It Li e +Count cache lines in the exclusive state. +.It Li i +Count cache lines in the invalid state. +.It Li m +Count cache lines in the modified state. +.It Li s +Count cache lines in the shared state. +.El +.Pp +The default is +.Dq Li eims . +.Pp +Events that require a snoop response qualifier to be specified use an +additional qualifier +.Dq Li snoopresponse= Ns Ar response , +where argument +.Ar response +comprises the following keywords separated by +.Dq + +signs: +.Bl -tag -width indent +.It Li clean +Measure CLEAN responses. +.It Li hit +Measure HIT responses. +.It Li hitm +Measure HITM responses. +.El +.Pp +The default is to measure all the above responses. +.Pp +Events that require a snoop type qualifier use an additional qualifier +.Dq Li snooptype= Ns Ar type , +where argument +.Ar type +comprises one of the following keywords: +.Bl -tag -width indent +.It Li cmp2i +Measure CMP2I snoops. +.It Li cmp2s +Measure CMP2S snoops. +.El +.Pp +The default is to measure both snoops. +.Ss Event Specifiers (Programmable PMCs) +Atom programmable PMCs support the following events: +.Bl -tag -width indent +.It Li BACLEARS +.Pq Event E6H , Umask 01H +The number of times the front end is resteered. +.It Li BOGUS_BR +.Pq Event E4H , Umask 00H +The number of byte sequences mistakenly detected as taken branch +instructions. +.It Li BR_BAC_MISSP_EXEC +.Pq Event 8AH , Umask 00H +The number of branch instructions that were mispredicted when +decoded. +.It Li BR_CALL_MISSP_EXEC +.Pq Event 93H , Umask 00H +The number of mispredicted +.Li CALL +instructions that were executed. +.It Li BR_CALL_EXEC +.Pq Event 92H , Umask 00H +The number of +.Li CALL +instructions executed. 
+.It Li BR_CND_EXEC +.Pq Event 8BH , Umask 00H +The number of conditional branches executed, but not necessarily retired. +.It Li BR_CND_MISSP_EXEC +.Pq Event 8CH , Umask 00H +The number of mispredicted conditional branches executed. +.It Li BR_IND_CALL_EXEC +.Pq Event 94H , Umask 00H +The number of indirect +.Li CALL +instructions executed. +.It Li BR_IND_EXEC +.Pq Event 8DH , Umask 00H +The number of indirect branch instructions executed. +.It Li BR_IND_MISSP_EXEC +.Pq Event 8EH , Umask 00H +The number of mispredicted indirect branch instructions executed. +.It Li BR_INST_DECODED +.Pq Event E0H , Umask 01H +The number of branch instructions decoded. +.It Li BR_INST_EXEC +.Pq Event 88H , Umask 00H +The number of branches executed, but not necessarily retired. +.It Li BR_INST_RETIRED.ANY +.Pq Event C4H , Umask 00H +.Pq Alias Qq "Branch Instruction Retired" +The number of branch instructions retired. +This is an architectural performance event. +.It Li BR_INST_RETIRED.ANY1 +.Pq Event C4H , Umask 0FH +The number of branch instructions retired that were mispredicted. +.It Li BR_INST_RETIRED.MISPRED +.Pq Event C5H , Umask 00H +.Pq Alias Qq "Branch Misses Retired" +The number of mispredicted branch instructions retired. +This is an architectural performance event. +.It Li BR_INST_RETIRED.MISPRED_NOT_TAKEN +.Pq Event C4H , Umask 02H +The number of not taken branch instructions retired that were +mispredicted. +.It Li BR_INST_RETIRED.MISPRED_TAKEN +.Pq Event C4H , Umask 08H +The number of taken branch instructions retired that were mispredicted. +.It Li BR_INST_RETIRED.PRED_NOT_TAKEN +.Pq Event C4H , Umask 01H +The number of not taken branch instructions retired that were +correctly predicted. +.It Li BR_INST_RETIRED.PRED_TAKEN +.Pq Event C4H , Umask 04H +The number of taken branch instructions retired that were correctly +predicted. +.It Li BR_INST_RETIRED.TAKEN +.Pq Event C4H , Umask 0CH +The number of taken branch instructions retired. 
+.It Li BR_MISSP_EXEC +.Pq Event 89H , Umask 00H +The number of mispredicted branch instructions that were executed. +.It Li BR_RET_MISSP_EXEC +.Pq Event 90H , Umask 00H +The number of mispredicted +.Li RET +instructions executed. +.It Li BR_RET_BAC_MISSP_EXEC +.Pq Event 91H , Umask 00H +The number of +.Li RET +instructions executed that were mispredicted at decode time. +.It Li BR_RET_EXEC +.Pq Event 8FH , Umask 00H +The number of +.Li RET +instructions executed. +.It Li BR_TKN_BUBBLE_1 +.Pq Event 97H , Umask 00H +The number of branch predicted taken with bubble 1. +.It Li BR_TKN_BUBBLE_2 +.Pq Event 98H , Umask 00H +The number of branch predicted taken with bubble 2. +.It Li BUSQ_EMPTY Op ,core= Ns Ar core +.Pq Event 7DH +The number of cycles during which the core did not have any pending +transactions in the bus queue. +.It Li BUS_BNR_DRV Op ,agent= Ns Ar agent +.Pq Event 61H +The number of Bus Not Ready signals asserted on the bus. +This event is thread-independent. +.It Li BUS_DATA_RCV Op ,core= Ns Ar core +.Pq Event 64H +The number of bus cycles during which the processor is receiving data. +This event is thread-independent. +.It Li BUS_DRDY_CLOCKS Op ,agent= Ns Ar agent +.Pq Event 62H +The number of bus cycles during which the Data Ready signal is asserted +on the bus. +This event is thread-independent. +.It Li BUS_HIT_DRV Op ,agent= Ns Ar agent +.Pq Event 7AH +The number of bus cycles during which the processor drives the +.Li HIT# +pin. +This event is thread-independent. +.It Li BUS_HITM_DRV Op ,agent= Ns Ar agent +.Pq Event 7BH +The number of bus cycles during which the processor drives the +.Li HITM# +pin. +This event is thread-independent. +.It Li BUS_IO_WAIT Op ,core= Ns Ar core +.Pq Event 7FH +The number of core cycles during which I/O requests wait in the bus +queue. +.It Li BUS_LOCK_CLOCKS Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 63H +The number of bus cycles during which the +.Li LOCK +signal was asserted on the bus. 
+This event is thread independent. +.It Li BUS_REQUEST_OUTSTANDING Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 60H +The number of pending full cache line read transactions on the bus +occurring in each cycle. +This event is thread independent. +.It Li BUS_TRANS_P Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6BH +The number of partial bus transactions. +.It Li BUS_TRANS_IFETCH Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 68H +The number of instruction fetch full cache line bus transactions. +.It Li BUS_TRANS_INVAL Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 69H +The number of invalidate bus transactions. +.It Li BUS_TRANS_PWR Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6AH +The number of partial write bus transactions. +.It Li BUS_TRANS_DEF Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6DH +The number of deferred bus transactions. +.It Li BUS_TRANS_BURST Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6EH +The number of burst transactions. +.It Li BUS_TRANS_MEM Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6FH +The number of memory bus transactions. +.It Li BUS_TRANS_ANY Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 70H +The number of bus transactions of any kind. +.It Li BUS_TRANS_BRD Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 65H +The number of burst read transactions. +.It Li BUS_TRANS_IO Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6CH +The number of completed I/O bus transactions due to +.Li IN +and +.Li OUT +instructions. +.It Li BUS_TRANS_RFO Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 66H +The number of Read For Ownership bus transactions. 
+.It Li BUS_TRANS_WB Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 67H +The number of explicit write-back bus transactions due to dirty line +evictions. +.It Li CMP_SNOOP Xo +.Op ,core= Ns Ar core +.Op ,snooptype= Ns Ar snoop +.Xc +.Pq Event 78H +The number of times the L1 data cache is snooped by the other core in +the same processor. +.It Li CPU_CLK_UNHALTED.BUS +.Pq Event 3CH , Umask 01H +.Pq Alias Qq "Unhalted Reference Cycles" +The number of bus cycles when the core is not in the halt state. +This is an architectural performance event. +.It Li CPU_CLK_UNHALTED.CORE_P +.Pq Event 3CH , Umask 00H +.Pq Alias Qq "Unhalted Core Cycles" +The number of core cycles while the core is not in a halt state. +This is an architectural performance event. +.It Li CPU_CLK_UNHALTED.NO_OTHER +.Pq Event 3CH , Umask 02H +The number of bus cycles during which the core remains unhalted and +the other core is halted. +.It Li CYCLES_DIV_BUSY +.Pq Event 14H , Umask 01H +The number of cycles the divider is busy. +.It Li CYCLES_INT_MASKED.CYCLES_INT_MASKED +.Pq Event C6H , Umask 01H +The number of cycles during which interrupts are disabled. +.It Li CYCLES_INT_MASKED.CYCLES_INT_PENDING_AND_MASKED +.Pq Event C6H , Umask 02H +The number of cycles during which there were pending interrupts while +interrupts were disabled. +.It Li CYCLES_L1I_MEM_STALLED +.Pq Event 86H , Umask 00H +The number of cycles for which an instruction fetch stalls. +.It Li DATA_TLB_MISSES.DTLB_MISS +.Pq Event 08H , Umask 07H +The number of memory accesses that missed the Data TLB. +.It Li DATA_TLB_MISSES.DTLB_MISS_LD +.Pq Event 08H , Umask 05H +The number of loads that missed the Data TLB. +.It Li DATA_TLB_MISSES.DTLB_MISS_ST +.Pq Event 08H , Umask 06H +The number of stores that missed the Data TLB. +.It Li DATA_TLB_MISSES.UTLB_MISS_LD +.Pq Event 08H , Umask 09H +The number of loads that missed the UTLB. 
+.It Li DELAYED_BYPASS.FP +.Pq Event 19H , Umask 00H +The number of floating point operations that used data immediately +after the data was generated by a non floating point execution unit. +.It Li DELAYED_BYPASS.LOAD +.Pq Event 19H , Umask 01H +The number of delayed bypass penalty cycles that a load operation incurred. +.It Li DELAYED_BYPASS.SIMD +.Pq Event 19H , Umask 02H +The number of times SIMD operations use data immediately after the data +was generated by a non-SIMD execution unit. +.It Li DIV +.Pq Event 13H , Umask 00H +The number of divide operations executed. +This event is only available on PMC1. +.It Li DIV.AR +.Pq Event 13H , Umask 81H +The number of divide operations retired. +.It Li DIV.S +.Pq Event 13H , Umask 01H +The number of divide operations executed. +.It Li DTLB_MISSES.ANY +.Pq Event 08H , Umask 01H +The number of Data TLB misses, including misses that result from +speculative accesses. +.It Li DTLB_MISSES.L0_MISS_LD +.Pq Event 08H , Umask 04H +The number of level 0 DTLB misses due to load operations. +.It Li DTLB_MISSES.MISS_LD +.Pq Event 08H , Umask 02H +The number of Data TLB misses due to load operations. +.It Li DTLB_MISSES.MISS_ST +.Pq Event 08H , Umask 08H +The number of Data TLB misses due to store operations. +.It Li EIST_TRANS +.Pq Event 3AH , Umask 00H +The number of Enhanced Intel SpeedStep Technology transitions. +.It Li ESP.ADDITIONS +.Pq Event ABH , Umask 02H +The number of automatic additions to the +.Li %esp +register. +.It Li ESP.SYNCH +.Pq Event ABH , Umask 01H +The number of times the +.Li %esp +register was explicitly used in an address expression after +it is implicitly used by a +.Li PUSH +or +.Li POP +instruction. +.It Li EXT_SNOOP Xo +.Op ,agent= Ns Ar agent +.Op ,snoopresponse= Ns Ar response +.Xc +.Pq Event 77H +The number of snoop responses to bus transactions. 
+.It Li FP_ASSIST +.Pq Event 11H , Umask 01H +The number of floating point operations executed that needed +a microcode assist, including speculatively executed instructions. +.It Li FP_ASSIST.AR +.Pq Event 11H , Umask 81H +The number of floating point operations retired that needed +a microcode assist. +.It Li FP_COMP_OPS_EXE +.Pq Event 10H , Umask 00H +The number of floating point computational micro-ops executed. +The event is available only on PMC0. +.It Li FP_MMX_TRANS_TO_FP +.Pq Event CCH , Umask 02H +The number of transitions from MMX instructions to floating point +instructions. +.It Li FP_MMX_TRANS_TO_MMX +.Pq Event CCH , Umask 01H +The number of transitions from floating point instructions to MMX +instructions. +.It Li HW_INT_RCV +.Pq Event C8H , Umask 00H +The number of hardware interrupts received. +.It Li ICACHE.ACCESSES +.Pq Event 80H , Umask 03H +The number of instruction fetches. +.It Li ICACHE.MISSES +.Pq Event 80H , Umask 02H +The number of instruction fetches that miss the instruction cache. +.It Li IDLE_DURING_DIV +.Pq Event 18H , Umask 00H +The number of cycles the divider is busy and no other execution unit +or load operation was in progress. +This event is available only on PMC0. +.It Li ILD_STALL +.Pq Event 87H , Umask 00H +The number of cycles the instruction length decoder stalled due to a +length changing prefix. +.It Li INST_QUEUE.FULL +.Pq Event 83H , Umask 02H +The number of cycles during which the instruction queue is full. +.It Li INST_RETIRED.ANY_P +.Pq Event C0H , Umask 00H +.Pq Alias Qq "Instruction Retired" +The number of instructions retired. +This is an architectural performance event. +.It Li INST_RETIRED.LOADS +.Pq Event C0H , Umask 01H +The number of instructions retired that contained a load operation. +.It Li INST_RETIRED.OTHER +.Pq Event C0H , Umask 04H +The number of instructions retired that did not contain a load or a +store operation. 
+.It Li INST_RETIRED.STORES +.Pq Event C0H , Umask 02H +The number of instructions retired that contained a store operation. +.It Li ITLB.FLUSH +.Pq Event 82H , Umask 04H +The number of ITLB flushes. +.It Li ITLB.LARGE_MISS +.Pq Event 82H , Umask 10H +The number of instruction fetches from large pages that miss the +ITLB. +.It Li ITLB.MISSES +.Pq Event 82H , Umask 02H +The number of instruction fetches from both large and small pages that +miss the ITLB. +.It Li ITLB.SMALL_MISS +.Pq Event 82H , Umask 02H +The number of instruction fetches from small pages that miss the ITLB. +.It Li ITLB_MISS_RETIRED +.Pq Event C9H , Umask 00H +The number of retired instructions that missed the ITLB when they were +fetched. +.It Li L1D_ALL_REF +.Pq Event 43H , Umask 01H +The number of references to L1 data cache counting loads and stores +to all memory types. +.It Li L1D_ALL_CACHE_REF +.Pq Event 43H , Umask 02H +The number of data reads and writes to cacheable memory. +.It Li L1D_CACHE_LOCK Op ,cachestate= Ns Ar state +.Pq Event 42H +The number of locked reads from cacheable memory. +.It Li L1D_CACHE_LOCK_DURATION +.Pq Event 42H , Umask 10H +The number of cycles during which any cache line is locked by any +locking instruction. +.It Li L1D_CACHE.LD +.Pq Event 40H , Umask 21H +The number of data reads from cacheable memory. +.It Li L1D_CACHE.ST +.Pq Event 41H , Umask 22H +The number of data writes to cacheable memory. +.It Li L1D_M_EVICT +.Pq Event 47H , Umask 00H +The number of modified cache lines evicted from L1 data cache. +.It Li L1D_M_REPL +.Pq Event 46H , Umask 00H +The number of modified lines allocated in L1 data cache. +.It Li L1D_PEND_MISS +.Pq Event 48H , Umask 00H +The total number of outstanding L1 data cache misses at any clock. +.It Li L1D_PREFETCH.REQUESTS +.Pq Event 4EH , Umask 10H +The number of times L1 data cache requested to prefetch a data cache +line. +.It Li L1D_REPL +.Pq Event 45H , Umask 0FH +The number of lines brought into L1 data cache. 
+.It Li L1D_SPLIT.LOADS +.Pq Event 49H , Umask 01H +The number of load operations that span two cache lines. +.It Li L1D_SPLIT.STORES +.Pq Event 49H , Umask 02H +The number of store operations that span two cache lines. +.It Li L1I_MISSES +.Pq Event 81H , Umask 00H +The number of instruction fetch unit misses. +.It Li L1I_READS +.Pq Event 80H , Umask 00H +The number of instruction fetches. +.It Li L2_ADS Op ,core= Ns Ar core +.Pq Event 21H +The number of cycles that the L2 address bus is in use. +.It Li L2_DBUS_BUSY_RD Op ,core= Ns Ar core +.Pq Event 23H +The number of core cycles during which the L2 data bus is busy +transferring data to the core. +.It Li L2_IFETCH Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Xc +.Pq Event 28H +The number of instruction cache line requests from the instruction +fetch unit. +.It Li L2_LD Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 29H +The number of L2 cache read requests from L1 cache and L2 +prefetchers. +.It Li L2_LINES_IN Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 24H +The number of cache lines allocated in L2 cache. +.It Li L2_LINES_OUT Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 26H +The number of L2 cache lines evicted. +.It Li L2_LOCK Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Xc +.Pq Event 2BH +The number of locked accesses to cache lines that miss L1 data +cache. +.It Li L2_M_LINES_IN Op ,core= Ns Ar core +.Pq Event 25H +The number of L2 cache line modifications. +.It Li L2_M_LINES_OUT Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 27H +The number of modified lines evicted from L2 cache. +.It Li L2_NO_REQ Op ,core= Ns Ar core +.Pq Event 32H +The number of cycles during which no L2 cache requests were pending +from a core. 
+.It Li L2_REJECT_BUSQ Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 30H +The number of L2 cache requests that were rejected. +.It Li L2_RQSTS Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 2EH +The number of completed L2 cache requests. +.It Li L2_RQSTS.SELF.DEMAND.I_STATE +.Pq Event 2EH , Umask 41H +.Pq Alias Qq "LLC Misses" +The number of completed L2 cache demand requests from this core that +missed the L2 cache. +This is an architectural performance event. +.It Li L2_RQSTS.SELF.DEMAND.MESI +.Pq Event 2EH , Umask 4FH +.Pq Alias Qq "LLC References" +The number of completed L2 cache demand requests from this core. +.It Li L2_ST Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Xc +.Pq Event 2AH +The number of store operations that miss the L1 cache and request data +from the L2 cache. +.It Li LOAD_BLOCK.L1D +.Pq Event 03H , Umask 20H +The number of loads blocked by the L1 data cache. +.It Li LOAD_BLOCK.OVERLAP_STORE +.Pq Event 03H , Umask 08H +The number of loads that partially overlap an earlier store or are +aliased with a previous store. +.It Li LOAD_BLOCK.STA +.Pq Event 03H , Umask 02H +The number of loads blocked by preceding stores whose address is yet +to be calculated. +.It Li LOAD_BLOCK.STD +.Pq Event 03H , Umask 04H +The number of loads blocked by preceding stores to the same address +whose data value is not known. +.It Li LOAD_BLOCK.UNTIL_RETIRE +.Pq Event 03H , Umask 10H +The number of load operations that were blocked until retirement. +.It Li LOAD_HIT_PRE +.Pq Event 4CH , Umask 00H +The number of load operations that conflicted with a prefetch to the +same cache line. +.It Li MACHINE_CLEARS.SMC +.Pq Event C3H , Umask 01H +The number of times a program writes to a code section. 
+.It Li MACHINE_NUKES.MEM_ORDER +.Pq Event C3H , Umask 04H +The number of times the execution pipeline was restarted due to a +memory ordering conflict or memory disambiguation misprediction. +.It Li MACRO_INSTS.ALL_DECODED +.Pq Event AAH , Umask 03H +The number of instructions decoded. +.It Li MACRO_INSTS.CISC_DECODED +.Pq Event AAH , Umask 02H +The number of complex instructions decoded. +.It Li MEMORY_DISAMBIGUATION.RESET +.Pq Event 09H , Umask 01H +The number of cycles during which memory disambiguation misprediction +occurs. +.It Li MEMORY_DISAMBIGUATION.SUCCESS +.Pq Event 09H , Umask 02H +The number of load operations that were successfully disambiguated. +.It Li MEM_LOAD_RETIRED.DTLB_MISS +.Pq Event CBH , Umask 04H +The number of retired load operations that missed the DTLB. +.It Li MEM_LOAD_RETIRED.L2_MISS +.Pq Event CBH , Umask 02H +The number of retired load operations that miss L2 cache. +.It Li MEM_LOAD_RETIRED.L2_HIT +.Pq Event CBH , Umask 01H +The number of retired load operations that hit L2 cache. +.It Li MEM_LOAD_RETIRED.L2_LINE_MISS +.Pq Event CBH , Umask 08H +The number of load operations that missed L2 cache and that caused a +bus request. +.It Li MUL +.Pq Event 12H , Umask 00H +The number of multiply operations executed. +This event is only available on PMC1. +.It Li MUL.AR +.Pq Event 12H , Umask 81H +The number of multiply operations retired. +.It Li MUL.S +.Pq Event 12H , Umask 01H +The number of multiply operations executed. +.It Li PAGE_WALKS.WALKS +.Pq Event 0CH , Umask 03H +The number of page walks executed due to an ITLB or DTLB miss. +.It Li PAGE_WALKS.CYCLES +.Pq Event 0CH , Umask 03H +.\" XXX Clarify. Identical event umask/event numbers. +The number of cycles spent in a page walk caused by an ITLB or DTLB +miss. +.It Li PREF_RQSTS_DN +.Pq Event F8H , Umask 00H +The number of downward prefetches issued from the Data Prefetch Logic +unit to L2 cache. 
+.It Li PREF_RQSTS_UP +.Pq Event F0H , Umask 00H +The number of upward prefetches issued from the Data Prefetch Logic +unit to L2 cache. +.It Li PREFETCH.PREFETCHNTA +.Pq Event 07H , Umask 08H +The number of +.Li PREFETCHNTA +instructions executed. +.It Li PREFETCH.PREFETCHT0 +.Pq Event 07H , Umask 01H +The number of +.Li PREFETCHT0 +instructions executed. +.It Li PREFETCH.SW_L2 +.Pq Event 07H , Umask 06H +The number of +.Li PREFETCHT1 +and +.Li PREFETCHT2 +instructions executed. +.It Li RAT_STALLS.ANY +.Pq Event D2H , Umask 0FH +The number of stall cycles due to any of +.Li RAT_STALLS.FLAGS , +.Li RAT_STALLS.FPSW , +.Li RAT_STALLS.PARTIAL_CYCLES +and +.Li RAT_STALLS.ROB_READ_PORT . +.It Li RAT_STALLS.FLAGS +.Pq Event D2H , Umask 04H +The number of cycles execution stalled due to a flag register induced +stall. +.It Li RAT_STALLS.FPSW +.Pq Event D2H , Umask 08H +The number of times the floating point status word was written. +.It Li RAT_STALLS.PARTIAL_CYCLES +.Pq Event D2H , Umask 02H +The number of cycles of added instruction execution latency due to the +use of a register that was partially written by previous instructions. +.It Li RAT_STALLS.ROB_READ_PORT +.Pq Event D2H , Umask 01H +The number of cycles when ROB read port stalls occurred. +.It Li RESOURCE_STALLS.ANY +.Pq Event DCH , Umask 1FH +The number of cycles during which any resource related stall +occurred. +.It Li RESOURCE_STALLS.BR_MISS_CLEAR +.Pq Event DCH , Umask 10H +The number of cycles stalled due to branch misprediction. +.It Li RESOURCE_STALLS.FPCW +.Pq Event DCH , Umask 08H +The number of cycles stalled due to writing the floating point control +word. +.It Li RESOURCE_STALLS.LD_ST +.Pq Event DCH , Umask 04H +The number of cycles during which the number of loads and stores in +the pipeline exceeded their limits. +.It Li RESOURCE_STALLS.ROB_FULL +.Pq Event DCH , Umask 01H +The number of cycles when the reorder buffer was full. 
+.It Li RESOURCE_STALLS.RS_FULL +.Pq Event DCH , Umask 02H +The number of cycles during which the RS was full. +.It Li RS_UOPS_DISPATCHED +.Pq Event A0H , Umask 00H +The number of micro-ops dispatched for execution. +.It Li RS_UOPS_DISPATCHED.PORT0 +.Pq Event A1H , Umask 01H +The number of cycles micro-ops were dispatched for execution on port +0. +.It Li RS_UOPS_DISPATCHED.PORT1 +.Pq Event A1H , Umask 02H +The number of cycles micro-ops were dispatched for execution on port +1. +.It Li RS_UOPS_DISPATCHED.PORT2 +.Pq Event A1H , Umask 04H +The number of cycles micro-ops were dispatched for execution on port +2. +.It Li RS_UOPS_DISPATCHED.PORT3 +.Pq Event A1H , Umask 08H +The number of cycles micro-ops were dispatched for execution on port +3. +.It Li RS_UOPS_DISPATCHED.PORT4 +.Pq Event A1H , Umask 10H +The number of cycles micro-ops were dispatched for execution on port +4. +.It Li RS_UOPS_DISPATCHED.PORT5 +.Pq Event A1H , Umask 20H +The number of cycles micro-ops were dispatched for execution on port +5. +.It Li SB_DRAIN_CYCLES +.Pq Event 04H , Umask 01H +The number of cycles while the store buffer is draining. +.It Li SEGMENT_REG_LOADS.ANY +.Pq Event 06H , Umask 00H +The number of segment register loads. +.It Li SEG_REG_RENAMES.ANY +.Pq Event D5H , Umask 0FH +The number of times any segment register was renamed. +.It Li SEG_REG_RENAMES.DS +.Pq Event D5H , Umask 02H +The number of times the +.Li %ds +register is renamed. +.It Li SEG_REG_RENAMES.ES +.Pq Event D5H , Umask 01H +The number of times the +.Li %es +register is renamed. +.It Li SEG_REG_RENAMES.FS +.Pq Event D5H , Umask 04H +The number of times the +.Li %fs +register is renamed. +.It Li SEG_REG_RENAMES.GS +.Pq Event D5H , Umask 08H +The number of times the +.Li %gs +register is renamed. +.It Li SEG_RENAME_STALLS.ANY +.Pq Event D4H , Umask 0FH +The number of stalls due to lack of resource to rename any segment +register. 
+.It Li SEG_RENAME_STALLS.DS +.Pq Event D4H , Umask 02H +The number of stalls due to lack of renaming resources for the +.Li %ds +register. +.It Li SEG_RENAME_STALLS.ES +.Pq Event D4H , Umask 01H +The number of stalls due to lack of renaming resources for the +.Li %es +register. +.It Li SEG_RENAME_STALLS.FS +.Pq Event D4H , Umask 04H +The number of stalls due to lack of renaming resources for the +.Li %fs +register. +.It Li SEG_RENAME_STALLS.GS +.Pq Event D4H , Umask 08H +The number of stalls due to lack of renaming resources for the +.Li %gs +register. +.It Li SIMD_ASSIST +.Pq Event CDH , Umask 00H +The number of SIMD assists invoked. +.It Li SIMD_COMP_INST_RETIRED.PACKED_DOUBLE +.Pq Event CAH , Umask 04H +The number of computational SSE2 packed double precision instructions +retired. +.It Li SIMD_COMP_INST_RETIRED.PACKED_SINGLE +.Pq Event CAH , Umask 01H +The number of computational SSE2 packed single precision instructions +retired. +.It Li SIMD_COMP_INST_RETIRED.SCALAR_DOUBLE +.Pq Event CAH , Umask 08H +The number of computational SSE2 scalar double precision instructions +retired. +.It Li SIMD_COMP_INST_RETIRED.SCALAR_SINGLE +.Pq Event CAH , Umask 02H +The number of computational SSE2 scalar single precision instructions +retired. +.It Li SIMD_INSTR_RETIRED +.Pq Event CEH , Umask 00H +The number of retired SIMD instructions that use MMX registers. +.It Li SIMD_INST_RETIRED.ANY +.Pq Event C7H , Umask 1FH +The number of streaming SIMD instructions retired. +.It Li SIMD_INST_RETIRED.PACKED_DOUBLE +.Pq Event C7H , Umask 04H +The number of SSE2 packed double precision instructions retired. +.It Li SIMD_INST_RETIRED.PACKED_SINGLE +.Pq Event C7H , Umask 01H +The number of SSE packed single precision instructions retired. +.It Li SIMD_INST_RETIRED.SCALAR_DOUBLE +.Pq Event C7H , Umask 08H +The number of SSE2 scalar double precision instructions retired. 
+.It Li SIMD_INST_RETIRED.SCALAR_SINGLE +.Pq Event C7H , Umask 02H +The number of SSE scalar single precision instructions retired. +.It Li SIMD_INST_RETIRED.VECTOR +.Pq Event C7H , Umask 10H +The number of SSE2 vector instructions retired. +.It Li SIMD_SAT_INSTR_RETIRED +.Pq Event CFH , Umask 00H +The number of saturated arithmetic SIMD instructions retired. +.It Li SIMD_SAT_UOP_EXEC.AR +.Pq Event B1H , Umask 80H +The number of SIMD saturated arithmetic micro-ops retired. +.It Li SIMD_SAT_UOP_EXEC.S +.Pq Event B1H , Umask 00H +The number of SIMD saturated arithmetic micro-ops executed. +.It Li SIMD_UOPS_EXEC.AR +.Pq Event B0H , Umask 80H +The number of SIMD micro-ops retired. +.It Li SIMD_UOPS_EXEC.S +.Pq Event B0H , Umask 00H +The number of SIMD micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.ARITHMETIC.AR +.Pq Event B3H , Umask A0H +The number of SIMD packed arithmetic micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.ARITHMETIC.S +.Pq Event B3H , Umask 20H +The number of SIMD packed arithmetic micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.LOGICAL.AR +.Pq Event B3H , Umask 90H +The number of SIMD packed logical micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.LOGICAL.S +.Pq Event B3H , Umask 10H +The number of SIMD packed logical micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.MUL.AR +.Pq Event B3H , Umask 81H +The number of SIMD packed multiply micro-ops retired. +.It Li SIMD_UOP_TYPE_EXEC.MUL.S +.Pq Event B3H , Umask 01H +The number of SIMD packed multiply micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.PACK.AR +.Pq Event B3H , Umask 84H +The number of SIMD pack micro-ops retired. +.It Li SIMD_UOP_TYPE_EXEC.PACK.S +.Pq Event B3H , Umask 04H +The number of SIMD pack micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.SHIFT.AR +.Pq Event B3H , Umask 82H +The number of SIMD packed shift micro-ops retired. +.It Li SIMD_UOP_TYPE_EXEC.SHIFT.S +.Pq Event B3H , Umask 02H +The number of SIMD packed shift micro-ops executed. 
+.It Li SIMD_UOP_TYPE_EXEC.UNPACK.AR +.Pq Event B3H , Umask 88H +The number of SIMD unpack micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.UNPACK.S +.Pq Event B3H , Umask 08H +The number of SIMD unpack micro-ops executed. +.It Li SNOOP_STALL_DRV Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 7EH +The number of times the bus stalled for snoops. +This event is thread-independent. +.It Li SSE_PRE_EXEC.L2 +.Pq Event 07H , Umask 02H +The number of +.Li PREFETCHT1 +instructions executed. +.It Li SSE_PRE_EXEC.STORES +.Pq Event 07H , Umask 03H +The number of times SSE non-temporal store instructions were executed. +.It Li SSE_PRE_MISS.L1 +.Pq Event 4BH , Umask 01H +The number of times the +.Li PREFETCHT0 +instruction executed and missed all cache levels. +.It Li SSE_PRE_MISS.L2 +.Pq Event 4BH , Umask 02H +The number of times the +.Li PREFETCHT1 +instruction executed and missed all cache levels. +.It Li SSE_PRE_MISS.NTA +.Pq Event 4BH , Umask 00H +The number of times the +.Li PREFETCHNTA +instruction executed and missed all cache levels. +.It Li STORE_BLOCK.ORDER +.Pq Event 04H , Umask 02H +The number of cycles while a store was waiting for another store to be +globally observed. +.It Li STORE_BLOCK.SNOOP +.Pq Event 04H , Umask 08H +The number of cycles while a store was blocked due to a conflict with +an internal or external snoop. +.It Li STORE_FORWARDS.GOOD +.Pq Event 02H , Umask 81H +The number of times stored data was forwarded directly to a load. +.It Li THERMAL_TRIP +.Pq Event 3BH , Umask C0H +The number of thermal trips. +.It Li UOPS_RETIRED.LD_IND_BR +.Pq Event C2H , Umask 01H +The number of micro-ops retired that fused a load with another +operation. +.It Li UOPS_RETIRED.STD_STA +.Pq Event C2H , Umask 02H +The number of store address calculations that fused into one micro-op. +.It Li UOPS_RETIRED.MACRO_FUSION +.Pq Event C2H , Umask 04H +The number of times retired instruction pairs were fused into one +micro-op. 
+.It Li UOPS_RETIRED.FUSED +.Pq Event C2H , Umask 07H +The number of fused micro-ops retired. +.It Li UOPS_RETIRED.NON_FUSED +.Pq Event C2H , Umask 8H +The number of non-fused micro-ops retired. +.It Li UOPS_RETIRED.ANY +.Pq Event C2H , Umask 10H +The number of micro-ops retired. +.It Li X87_COMP_OPS_EXE.ANY.AR +.Pq Event 10H , Umask 81H +The number of x87 floating-point computational micro-ops retired. +.It Li X87_COMP_OPS_EXE.ANY.S +.Pq Event 10H , Umask 01H +The number of x87 floating-point computational micro-ops executed. +.It Li X87_OPS_RETIRED.ANY +.Pq Event C1H , Umask FEH +The number of floating point computational instructions retired. +.It Li X87_OPS_RETIRED.FXCH +.Pq Event C1H , Umask 01H +The number of +.Li FXCH +instructions retired. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used on these CPUs. +.Bl -column "branch-mispredicts" "cpu_clk_unhalted.core_p" "PMC Class" +.It Em Alias Ta Em Event Ta Em PMC Class +.It Li branches Ta Li BR_INST_RETIRED.ANY Ta Li PMC_CLASS_IAP +.It Li branch-mispredicts Ta Li BR_INST_RETIRED.MISPRED Ta Li PMC_CLASS_IAP +.It Li ic-misses Ta Li ICACHE.MISSES Ta Li PMC_CLASS_IAP +.It Li instructions Ta Li INST_RETIRED.ANY_P Ta Li PMC_CLASS_IAF +.It Li interrupts Ta Li HW_INT_RCV Ta Li PMC_CLASS_IAP +.It Li unhalted-cycles Ta Li CPU_CLK_UNHALTED.CORE_P Ta Li PMC_CLASS_IAF +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . 
diff --git a/lib/libpmc/pmc.core.3 b/lib/libpmc/pmc.core.3 new file mode 100644 index 0000000..d32e62a --- /dev/null +++ b/lib/libpmc/pmc.core.3 @@ -0,0 +1,808 @@ +.\" Copyright (c) 2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 12, 2008 +.Dt PMC.CORE 3 +.Os +.Sh NAME +.Nm pmc.core +.Nd measurement events for +.Tn Intel +.Tn Core Solo +and +.Tn Core Duo +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Core Solo" +and +.Tn "Core Duo" +CPUs contain PMCs conforming to version 1 of the +.Tn Intel +performance measurement architecture. 
+.Pp +These PMCs are documented in +.Rs +.%B IA-32 Intel\(rg Architecture Software Developer's Manual +.%T Volume 3: System Programming Guide +.%N Order Number 253669-027US +.%D July 2008 +.%Q Intel Corporation +.Re +.Ss PMC Features +CPUs conforming to version 1 of the +.Tn Intel +performance measurement architecture contain two programmable PMCs of +class +.Li PMC_CLASS_IAP . +The PMCs are 40 bits wide and offer the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.It Li os +Configure the PMC to count events happening at processor privilege +level 0. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both.
+.Pp +Events that require core-specificity to be specified use an +additional qualifier +.Dq Li core= Ns Ar value , +where argument +.Ar value +is one of: +.Bl -tag -width indent -compact +.It Li all +Measure event conditions on all cores. +.It Li this +Measure event conditions on this core. +.El +The default is +.Dq Li this . +.Pp +Events that require an agent qualifier to be specified use an +additional qualifier +.Dq Li agent= Ns Ar value , +where argument +.Ar value +is one of: +.Bl -tag -width indent -compact +.It Li this +Measure events associated with this bus agent. +.It Li any +Measure events caused by any bus agent. +.El +The default is +.Dq Li this . +.Pp +Events that require a hardware prefetch qualifier to be specified use an +additional qualifier +.Dq Li prefetch= Ns Ar value , +where argument +.Ar value +is one of: +.Bl -tag -width "exclude" -compact +.It Li both +Include all prefetches. +.It Li only +Only count hardware prefetches. +.It Li exclude +Exclude hardware prefetches. +.El +The default is +.Dq Li both . +.Pp +Events that require a cache coherence qualifier to be specified use an +additional qualifier +.Dq Li cachestate= Ns Ar value , +where argument +.Ar value +contains one or more of the following letters: +.Bl -tag -width indent -compact +.It Li e +Count cache lines in the exclusive state. +.It Li i +Count cache lines in the invalid state. +.It Li m +Count cache lines in the modified state. +.It Li s +Count cache lines in the shared state. +.El +The default is +.Dq Li eims . +.Ss Event Specifiers +The following event names are case insensitive. +Whitespace, hyphens and underscore characters in these names are +ignored. +.Pp +Core PMCs support the following events: +.Bl -tag -width indent +.It Li BAClears +.Pq Event E6H , Umask 00H +The number of BAClear conditions asserted. +.It Li BTB_Misses +.Pq Event E2H , Umask 00H +The number of branches for which the branch target buffer did not +produce a prediction.
+.It Li Br_BAC_Missp_Exec +.Pq Event 8AH , Umask 00H +The number of branch instructions executed that were mispredicted at +the front end. +.It Li Br_Bogus +.Pq Event E4H , Umask 00H +The number of bogus branches. +.It Li Br_Call_Exec +.Pq Event 92H , Umask 00H +The number of +.Li CALL +instructions executed. +.It Li Br_Call_Missp_Exec +.Pq Event 93H , Umask 00H +The number of +.Li CALL +instructions executed that were mispredicted. +.It Li Br_Cnd_Exec +.Pq Event 8BH , Umask 00H +The number of conditional branch instructions executed. +.It Li Br_Cnd_Missp_Exec +.Pq Event 8CH , Umask 00H +The number of conditional branch instructions executed that were mispredicted. +.It Li Br_Ind_Call_Exec +.Pq Event 94H , Umask 00H +The number of indirect +.Li CALL +instructions executed. +.It Li Br_Ind_Exec +.Pq Event 8DH , Umask 00H +The number of indirect branches executed. +.It Li Br_Ind_Missp_Exec +.Pq Event 8EH , Umask 00H +The number of indirect branch instructions executed that were mispredicted. +.It Li Br_Inst_Exec +.Pq Event 88H , Umask 00H +The number of branch instructions executed including speculative branches. +.It Li Br_Instr_Decoded +.Pq Event E0H , Umask 00H +The number of branch instructions decoded. +.It Li Br_Instr_Ret +.Pq Event C4H , Umask 00H +.Pq Alias Qq "Branch Instruction Retired" +The number of branch instructions retired. +This is an architectural performance event. +.It Li Br_MisPred_Ret +.Pq Event C5H , Umask 00H +.Pq Alias Qq "Branch Misses Retired" +The number of mispredicted branch instructions retired. +This is an architectural performance event. +.It Li Br_MisPred_Taken_Ret +.Pq Event CAH , Umask 00H +The number of taken and mispredicted branches retired. +.It Li Br_Missp_Exec +.Pq Event 89H , Umask 00H +The number of branch instructions executed and mispredicted at +execution including branches that were not predicted. 
+.It Li Br_Ret_BAC_Missp_Exec +.Pq Event 91H , Umask 00H +The number of return branch instructions that were mispredicted at the +front end. +.It Li Br_Ret_Exec +.Pq Event 8FH , Umask 00H +The number of return branch instructions executed. +.It Li Br_Ret_Missp_Exec +.Pq Event 90H , Umask 00H +The number of return branch instructions executed that were mispredicted. +.It Li Br_Taken_Ret +.Pq Event C9H , Umask 00H +The number of taken branches retired. +.It Li Bus_BNR_Clocks +.Pq Event 61H , Umask 00H +The number of external bus cycles while BNR (bus not ready) was asserted. +.It Li Bus_DRDY_Clocks Op ,agent= Ns Ar agent +.Pq Event 62H , Umask 00H +The number of external bus cycles while DRDY was asserted. +.It Li Bus_Data_Rcv +.Pq Event 64H , Umask 40H +.\" XXX Using the description in Core2 PMC documentation. +The number of cycles during which the processor is busy receiving data. +.It Li Bus_Locks_Clocks Op ,core= Ns Ar core +.Pq Event 63H +The number of external bus cycles while the bus lock signal was asserted. +.It Li Bus_Not_In_Use Op ,core= Ns Ar core +.Pq Event 7DH +The number of cycles when there is no transaction from the core. +.It Li Bus_Req_Outstanding Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 60H +The weighted cycles of cacheable bus data read requests +from the data cache unit or hardware prefetcher. +.It Li Bus_Snoop_Stall +.Pq Event 7EH , Umask 00H +The number of bus cycles while a bus snoop is stalled. +.It Li Bus_Snoops Xo +.Op ,agent= Ns Ar agent +.Op ,cachestate= Ns Ar mesi +.Xc +.Pq Event 77H +.\" XXX Using the description in Core2 PMC documentation. +The number of snoop responses to bus transactions. +.It Li Bus_Trans_Any Op ,agent= Ns Ar agent +.Pq Event 70H +The number of completed bus transactions. +.It Li Bus_Trans_Brd Op ,core= Ns Ar core +.Pq Event 65H +The number of read bus transactions. +.It Li Bus_Trans_Burst Op ,agent= Ns Ar agent +.Pq Event 6EH +The number of completed burst transactions.
+Retried transactions may be counted more than once. +.It Li Bus_Trans_Def Op ,core= Ns Ar core +.Pq Event 6DH +The number of completed deferred transactions. +.It Li Bus_Trans_IO Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6CH +The number of completed I/O transactions counting both reads and +writes. +.It Li Bus_Trans_Ifetch Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 68H +Completed instruction fetch transactions. +.It Li Bus_Trans_Inval Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 69H +The number of completed invalidate transactions. +.It Li Bus_Trans_Mem Op ,agent= Ns Ar agent +.Pq Event 6FH +The number of completed memory transactions. +.It Li Bus_Trans_P Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6BH +The number of completed partial transactions. +.It Li Bus_Trans_Pwr Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6AH +The number of completed partial write transactions. +.It Li Bus_Trans_RFO Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 66H +The number of completed read-for-ownership transactions. +.It Li Bus_Trans_WB Op ,agent= Ns Ar agent +.Pq Event 67H +The number of completed write-back transactions from the data cache +unit, excluding L2 write-backs. +.It Li Cycles_Div_Busy +.Pq Event 14H , Umask 00H +The number of cycles the divider is busy. +The event is only available on PMC0. +.It Li Cycles_Int_Masked +.Pq Event C6H , Umask 00H +The number of cycles while interrupts were disabled. +.It Li Cycles_Int_Pending_Masked +.Pq Event C7H , Umask 00H +The number of cycles while interrupts were disabled and interrupts +were pending. +.It Li DCU_Snoop_To_Share Op ,core= Ns Ar core +.Pq Event 78H +The number of data cache unit snoops to L1 cache lines in the shared +state. +.It Li DCache_Cache_Lock Op ,cachestate= Ns Ar mesi +.\" XXX needs clarification +.Pq Event 42H +The number of cacheable locked read operations to invalid state.
+.It Li DCache_Cache_LD Op ,cachestate= Ns Ar mesi +.Pq Event 40H +The number of cacheable L1 data read operations. +.It Li DCache_Cache_ST Op ,cachestate= Ns Ar mesi +.Pq Event 41H +The number of cacheable L1 data write operations. +.It Li DCache_M_Evict +.Pq Event 47H , Umask 00H +The number of M state data cache lines that were evicted. +.It Li DCache_M_Repl +.Pq Event 46H , Umask 00H +The number of M state data cache lines that were allocated. +.It Li DCache_Pend_Miss +.Pq Event 48H , Umask 00H +The weighted cycles an L1 miss was outstanding. +.It Li DCache_Repl +.Pq Event 45H , Umask 0FH +The number of data cache line replacements. +.It Li Data_Mem_Cache_Ref +.Pq Event 44H , Umask 02H +The number of cacheable read and write operations to L1 data cache. +.It Li Data_Mem_Ref +.Pq Event 43H , Umask 01H +The number of L1 data reads and writes, both cacheable and +un-cacheable. +.It Li Dbus_Busy Op ,core= Ns Ar core +.Pq Event 22H +The number of core cycles during which the data bus was busy. +.It Li Dbus_Busy_Rd Op ,core= Ns Ar core +.Pq Event 23H +The number of cycles during which the data bus was busy transferring +data to a core. +.It Li Div +.Pq Event 13H , Umask 00H +The number of divide operations including speculative operations for +integer and floating point divides. +This event can only be counted on PMC1. +.It Li Dtlb_Miss +.Pq Event 49H , Umask 00H +The number of data references that missed the TLB. +.It Li ESP_Uops +.Pq Event D7H , Umask 00H +The number of ESP folding instructions decoded. +.It Li EST_Trans Op ,trans= Ns Ar transition +.Pq Event 3AH +Count the number of Intel Enhanced SpeedStep transitions. +The argument +.Ar transition +can be one of the following values: +.Bl -tag -width indent -compact +.It Li any +(Umask 00H) Count all transitions. +.It Li frequency +(Umask 01H) Count frequency transitions. +.El +The default is +.Dq Li any .
+.It Li FP_Assist +.Pq Event 11H , Umask 00H +The number of floating point operations that required microcode +assists. +The event is only available on PMC1. +.It Li FP_Comp_Instr_Ret +.Pq Event C1H , Umask 00H +The number of X87 floating point compute instructions retired. +The event is only available on PMC0. +.It Li FP_Comps_Op_Exe +.Pq Event 10H , Umask 00H +The number of floating point computational instructions executed. +.It Li FP_MMX_Trans +.Pq Event CCH , Umask 01H +The number of transitions from X87 to MMX. +.It Li Fused_Ld_Uops_Ret +.Pq Event DAH , Umask 01H +The number of fused load uops retired. +.It Li Fused_St_Uops_Ret +.Pq Event DAH , Umask 02H +The number of fused store uops retired. +.It Li Fused_Uops_Ret +.Pq Event DAH , Umask 00H +The number of fused uops retired. +.It Li HW_Int_Rx +.Pq Event C8H , Umask 00H +The number of hardware interrupts received. +.It Li ICache_Misses +.Pq Event 81H , Umask 00H +The number of instruction fetch misses in the instruction cache and +streaming buffers. +.It Li ICache_Reads +.Pq Event 80H , Umask 00H +The number of instruction fetches from the instruction cache and +streaming buffers counting both cacheable and un-cacheable fetches. +.It Li IFU_Mem_Stall +.Pq Event 86H , Umask 00H +The number of cycles the instruction fetch unit was stalled while +waiting for data from memory. +.It Li ILD_Stall +.Pq Event 87H , Umask 00H +The number of instruction length decoder stalls. +.It Li ITLB_Misses +.Pq Event 85H , Umask 00H +The number of instruction TLB misses. +.It Li Instr_Decoded +.Pq Event D0H , Umask 00H +The number of instructions decoded. +.It Li Instr_Ret +.Pq Event C0H , Umask 00H +.Pq Alias Qq "Instruction Retired" +The number of instructions retired. +This is an architectural performance event. +.It Li L1_Pref_Req +.Pq Event 4FH , Umask 00H +The number of L1 prefetch requests due to data cache misses. +.It Li L2_ADS Op ,core= Ns Ar core +.Pq Event 21H +The number of L2 address strobes.
+.It Li L2_IFetch Xo +.Op ,cachestate= Ns Ar mesi +.Op ,core= Ns Ar core +.Xc +.Pq Event 28H +The number of instruction fetches by the instruction fetch unit from +L2 cache including speculative fetches. +.It Li L2_LD Xo +.Op ,cachestate= Ns Ar mesi +.Op ,core= Ns Ar core +.Xc +.Pq Event 29H +The number of L2 cache reads. +.It Li L2_Lines_In Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 24H +The number of L2 cache lines allocated. +.It Li L2_Lines_Out Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 26H +The number of L2 cache lines evicted. +.It Li L2_M_Lines_In Op ,core= Ns Ar core +.Pq Event 25H +The number of L2 M state cache lines allocated. +.It Li L2_M_Lines_Out Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 27H +The number of L2 M state cache lines evicted. +.It Li L2_No_Request_Cycles Xo +.Op ,cachestate= Ns Ar mesi +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 32H +The number of cycles there was no request to access L2 cache. +.It Li L2_Reject_Cycles Xo +.Op ,cachestate= Ns Ar mesi +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 30H +The number of cycles the L2 cache was busy and rejecting new requests. +.It Li L2_Rqsts Xo +.Op ,cachestate= Ns Ar mesi +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 2EH +The number of L2 cache requests. +.It Li L2_ST Xo +.Op ,cachestate= Ns Ar mesi +.Op ,core= Ns Ar core +.Xc +.Pq Event 2AH +The number of L2 cache writes including speculative writes. +.It Li LD_Blocks +.Pq Event 03H , Umask 00H +The number of load operations delayed due to store buffer blocks. +.It Li LLC_Misses +.Pq Event 2EH , Umask 41H +The number of cache misses for references to the last level cache, +excluding misses due to hardware prefetches. +This is an architectural performance event. +.It Li LLC_Reference +.Pq Event 2EH , Umask 4FH +The number of references to the last level cache, +excluding those due to hardware prefetches.
+This is an architectural performance event. +.It Li MMX_Assist +.Pq Event CDH , Umask 00H +The number of EMMX instructions executed. +.It Li MMX_FP_Trans +.Pq Event CCH , Umask 00H +The number of transitions from MMX to X87. +.It Li MMX_Instr_Exec +.Pq Event B0H , Umask 00H +The number of MMX instructions executed excluding +.Li MOVQ +and +.Li MOVD +stores. +.It Li MMX_Instr_Ret +.Pq Event CEH , Umask 00H +The number of MMX instructions retired. +.It Li Misalign_Mem_Ref +.Pq Event 05H , Umask 00H +The number of misaligned data memory references, counting loads and +stores. +.It Li Mul +.Pq Event 12H , Umask 00H +The number of multiply operations including speculative floating point +and integer multiplies. +This event is available on PMC1 only. +.It Li NonHlt_Ref_Cycles +.Pq Event 3CH , Umask 01H +.Pq Alias Qq "Unhalted Reference Cycles" +The number of non-halted bus cycles. +This is an architectural performance event. +.It Li Pref_Rqsts_Dn +.Pq Event F8H , Umask 00H +The number of hardware prefetch requests issued in backward streams. +.It Li Pref_Rqsts_Up +.Pq Event F0H , Umask 00H +The number of hardware prefetch requests issued in forward streams. +.It Li Resource_Stall +.Pq Event A2H , Umask 00H +The number of cycles where there is a resource related stall. +.It Li SD_Drains +.Pq Event 04H , Umask 00H +The number of cycles while draining store buffers. +.It Li SIMD_FP_DP_P_Ret +.Pq Event D8H , Umask 02H +The number of SSE/SSE2 packed double precision instructions retired. +.It Li SIMD_FP_DP_P_Comp_Ret +.Pq Event D9H , Umask 02H +The number of SSE/SSE2 packed double precision compute instructions +retired. +.It Li SIMD_FP_DP_S_Ret +.Pq Event D8H , Umask 03H +The number of SSE/SSE2 scalar double precision instructions retired. +.It Li SIMD_FP_DP_S_Comp_Ret +.Pq Event D9H , Umask 03H +The number of SSE/SSE2 scalar double precision compute instructions +retired.
+.It Li SIMD_FP_SP_P_Comp_Ret +.Pq Event D9H , Umask 00H +The number of SSE/SSE2 packed single precision compute instructions +retired. +.It Li SIMD_FP_SP_Ret +.Pq Event D8H , Umask 00H +The number of SSE/SSE2 single precision instructions retired, +both packed and scalar. +.It Li SIMD_FP_SP_S_Ret +.Pq Event D8H , Umask 01H +The number of SSE/SSE2 scalar single precision instructions retired. +.It Li SIMD_FP_SP_S_Comp_Ret +.Pq Event D9H , Umask 01H +The number of SSE/SSE2 scalar single precision compute instructions retired. +.It Li SIMD_Int_128_Ret +.Pq Event D8H , Umask 04H +The number of SSE2 128-bit integer instructions retired. +.It Li SIMD_Int_Pari_Exec +.Pq Event B3H , Umask 20H +The number of SIMD integer packed arithmetic instructions executed. +.It Li SIMD_Int_Pck_Exec +.Pq Event B3H , Umask 04H +The number of SIMD integer pack instructions executed. +.It Li SIMD_Int_Plog_Exec +.Pq Event B3H , Umask 10H +The number of SIMD integer packed logical instructions executed. +.It Li SIMD_Int_Pmul_Exec +.Pq Event B3H , Umask 01H +The number of SIMD integer packed multiply instructions executed. +.It Li SIMD_Int_Psft_Exec +.Pq Event B3H , Umask 02H +The number of SIMD integer packed shift instructions executed. +.It Li SIMD_Int_Sat_Exec +.Pq Event B1H , Umask 00H +The number of SIMD integer saturating instructions executed. +.It Li SIMD_Int_Upck_Exec +.Pq Event B3H , Umask 08H +The number of SIMD integer unpack instructions executed. +.It Li SMC_Detected +.Pq Event C3H , Umask 00H +The number of times self-modifying code was detected. +.It Li SSE_NTStores_Miss +.Pq Event 4BH , Umask 03H +The number of times an SSE streaming store instruction missed all caches. +.It Li SSE_NTStores_Ret +.Pq Event 07H , Umask 03H +The number of SSE streaming store instructions executed. +.It Li SSE_PrefNta_Miss +.Pq Event 4BH , Umask 00H +The number of times +.Li PREFETCHNTA +missed all caches.
+.It Li SSE_PrefNta_Ret +.Pq Event 07H , Umask 00H +The number of +.Li PREFETCHNTA +instructions retired. +.It Li SSE_PrefT1_Miss +.Pq Event 4BH , Umask 01H +The number of times +.Li PREFETCHT1 +missed all caches. +.It Li SSE_PrefT1_Ret +.Pq Event 07H , Umask 01H +The number of +.Li PREFETCHT1 +instructions retired. +.It Li SSE_PrefT2_Miss +.Pq Event 4BH , Umask 02H +The number of times +.Li PREFETCHT2 +missed all caches. +.It Li SSE_PrefT2_Ret +.Pq Event 07H , Umask 02H +The number of +.Li PREFETCHT2 +instructions retired. +.It Li Seg_Reg_Loads +.Pq Event 06H , Umask 00H +The number of segment register loads. +.It Li Serial_Execution_Cycles +.Pq Event 3CH , Umask 02H +The number of non-halted bus cycles of this core while the other core +was halted. +.It Li Thermal_Trip +.Pq Event 3BH , Umask C0H +The duration in a thermal trip based on the current core clock. +.It Li Unfusion +.Pq Event DBH , Umask 00H +The number of unfusion events. +.It Li Unhalted_Core_Cycles +.Pq Event 3CH , Umask 00H +The number of core clock cycles when the clock signal on a specific +core is not halted. +This is an architectural performance event. +.It Li Uops_Ret +.Pq Event C2H , Umask 00H +The number of micro-ops retired. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used. +.Bl -column "branch-mispredicts" "Description" +.It Em Alias Ta Em Event +.It Li branches Ta Li Br_Instr_Ret +.It Li branch-mispredicts Ta Li Br_MisPred_Ret +.It Li dc-misses Ta (unsupported) +.It Li ic-misses Ta Li ICache_Misses +.It Li instructions Ta Li Instr_Ret +.It Li interrupts Ta Li HW_Int_Rx +.It Li unhalted-cycles Ta (unsupported) +.El +.Sh PROCESSOR ERRATA +The following errata affect performance measurement on these +processors.
+These errata are documented in +.Rs +.%B Specification Update +.%T Intel\(rg CoreTM Duo Processor and Intel\(rg CoreTM Solo Processor on 65 nm Process +.%N Order Number 309222-017 +.%D July 2008 +.%Q Intel Corporation +.Re +.Bl -tag -width indent -compact +.It AE19 +Data prefetch performance monitoring events can only be enabled +on a single core. +.It AE25 +Performance monitoring counters that count external bus events +may report incorrect values after processor power state transitions. +.It AE28 +Performance monitoring events for retired floating point operations +(C1H) may not be accurate. +.It AE29 +DR3 address match on MOVD/MOVQ/MOVNTQ memory store +instruction may incorrectly increment performance monitoring count +for saturating SIMD instructions retired (Event CFH). +.It AE33 +Hardware prefetch performance monitoring events may be counted +inaccurately. +.It AE36 +The +.Li CPU_CLK_UNHALTED +performance monitoring event (Event 3CH) counts +clocks when the processor is in the C1/C2 processor power states. +.It AE39 +Certain performance monitoring counters related to bus, L2 cache +and power management are inaccurate. +.It AE51 +Performance monitoring events for retired instructions (Event C0H) may +not be accurate. +.It AE67 +Performance monitoring event +.Li FP_ASSIST +may not be accurate. +.It AE78 +Performance monitoring event for hardware prefetch requests (Event +4EH) and hardware prefetch request cache misses (Event 4FH) may not be +accurate. +.It AE82 +Performance monitoring event +.Li FP_MMX_TRANS_TO_MMX +may not count some transitions. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . 
diff --git a/lib/libpmc/pmc.core2.3 b/lib/libpmc/pmc.core2.3 new file mode 100644 index 0000000..3dbc0c8 --- /dev/null +++ b/lib/libpmc/pmc.core2.3 @@ -0,0 +1,1124 @@ +.\" Copyright (c) 2008,2009 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd June 8, 2009 +.Dt PMC.CORE2 3 +.Os +.Sh NAME +.Nm pmc.core2 +.Nd measurement events for +.Tn Intel +.Tn Core2 +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Core2" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. 
+These CPUs may contain up to two classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_IAP" +.It Li PMC_CLASS_IAF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_IAP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Core2 PMCs are documented in +.Rs +.%B "IA-32 Intel(R) Architecture Software Developer's Manual" +.%T "Volume 3: System Programming Guide" +.%N "Order Number 253669-027US" +.%D July 2008 +.%Q "Intel Corporation" +.Re +.Ss CORE2 FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.iaf 3 . +Not all CPUs in this family implement fixed-function counters. +.Ss CORE2 PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. 
+.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.It Li os +Configure the PMC to count events happening at processor privilege +level 0. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. +.Pp +Events that require core-specificity to be specified use an +additional qualifier +.Dq Li core= Ns Ar core , +where argument +.Ar core +is one of: +.Bl -tag -width indent +.It Li all +Measure event conditions on all cores. +.It Li this +Measure event conditions on this core. +.El +.Pp +The default is +.Dq Li this . +.Pp +Events that require an agent qualifier to be specified use an +additional qualifier +.Dq Li agent= Ns Ar agent , +where argument +.Ar agent +is one of: +.Bl -tag -width indent +.It Li this +Measure events associated with this bus agent. +.It Li any +Measure events caused by any bus agent. +.El +.Pp +The default is +.Dq Li this . +.Pp +Events that require a hardware prefetch qualifier to be specified use an +additional qualifier +.Dq Li prefetch= Ns Ar prefetch , +where argument +.Ar prefetch +is one of: +.Bl -tag -width "exclude" +.It Li both +Include all prefetches. +.It Li only +Only count hardware prefetches. +.It Li exclude +Exclude hardware prefetches. +.El +.Pp +The default is +.Dq Li both . +.Pp +Events that require a cache coherence qualifier to be specified use an +additional qualifier +.Dq Li cachestate= Ns Ar state , +where argument +.Ar state +contains one or more of the following letters: +.Bl -tag -width indent +.It Li e +Count cache lines in the exclusive state. +.It Li i +Count cache lines in the invalid state. +.It Li m +Count cache lines in the modified state. +.It Li s +Count cache lines in the shared state.
+.El +.Pp +The default is +.Dq Li eims . +.Pp +Events that require a snoop response qualifier to be specified use an +additional qualifier +.Dq Li snoopresponse= Ns Ar response , +where argument +.Ar response +comprises the following keywords separated by +.Dq + +signs: +.Bl -tag -width indent +.It Li clean +Measure CLEAN responses. +.It Li hit +Measure HIT responses. +.It Li hitm +Measure HITM responses. +.El +.Pp +The default is to measure all the above responses. +.Pp +Events that require a snoop type qualifier use an additional qualifier +.Dq Li snooptype= Ns Ar type , +where argument +.Ar type +is one of the following keywords: +.Bl -tag -width indent +.It Li cmp2i +Measure CMP2I snoops. +.It Li cmp2s +Measure CMP2S snoops. +.El +.Pp +The default is to measure both snoops. +.Ss Event Specifiers (Programmable PMCs) +Core2 programmable PMCs support the following events: +.Bl -tag -width indent +.It Li BACLEARS +.Pq Event E6H , Umask 00H +The number of times the front end is resteered. +.It Li BOGUS_BR +.Pq Event E4H , Umask 00H +The number of byte sequences mistakenly detected as taken branch +instructions. +.It Li BR_BAC_MISSP_EXEC +.Pq Event 8AH , Umask 00H +The number of branch instructions that were mispredicted when +decoded. +.It Li BR_CALL_MISSP_EXEC +.Pq Event 93H , Umask 00H +The number of mispredicted +.Li CALL +instructions that were executed. +.It Li BR_CALL_EXEC +.Pq Event 92H , Umask 00H +The number of +.Li CALL +instructions executed. +.It Li BR_CND_EXEC +.Pq Event 8BH , Umask 00H +The number of conditional branches executed, but not necessarily retired. +.It Li BR_CND_MISSP_EXEC +.Pq Event 8CH , Umask 00H +The number of mispredicted conditional branches executed. +.It Li BR_IND_CALL_EXEC +.Pq Event 94H , Umask 00H +The number of indirect +.Li CALL +instructions executed. +.It Li BR_IND_EXEC +.Pq Event 8DH , Umask 00H +The number of indirect branch instructions executed.
+.It Li BR_IND_MISSP_EXEC +.Pq Event 8EH , Umask 00H +The number of mispredicted indirect branch instructions executed. +.It Li BR_INST_DECODED +.Pq Event E0H , Umask 00H +The number of branch instructions decoded. +.It Li BR_INST_EXEC +.Pq Event 88H , Umask 00H +The number of branches executed, but not necessarily retired. +.It Li BR_INST_RETIRED.ANY +.Pq Event C4H , Umask 00H +.Pq Alias Qq "Branch Instruction Retired" +The number of branch instructions retired. +This is an architectural performance event. +.It Li BR_INST_RETIRED.MISPRED +.Pq Event C5H , Umask 00H +.Pq Alias Qq "Branch Misses Retired" +The number of mispredicted branch instructions retired. +This is an architectural performance event. +.It Li BR_INST_RETIRED.MISPRED_NOT_TAKEN +.Pq Event C4H , Umask 02H +The number of not taken branch instructions retired that were +mispredicted. +.It Li BR_INST_RETIRED.MISPRED_TAKEN +.Pq Event C4H , Umask 08H +The number of taken branch instructions retired that were mispredicted. +.It Li BR_INST_RETIRED.PRED_NOT_TAKEN +.Pq Event C4H , Umask 01H +The number of not taken branch instructions retired that were +correctly predicted. +.It Li BR_INST_RETIRED.PRED_TAKEN +.Pq Event C4H , Umask 04H +The number of taken branch instructions retired that were correctly +predicted. +.It Li BR_INST_RETIRED.TAKEN +.Pq Event C4H , Umask 0CH +The number of taken branch instructions retired. +.It Li BR_MISSP_EXEC +.Pq Event 89H , Umask 00H +The number of mispredicted branch instructions that were executed. +.It Li BR_RET_MISSP_EXEC +.Pq Event 90H , Umask 00H +The number of mispredicted +.Li RET +instructions executed. +.It Li BR_RET_BAC_MISSP_EXEC +.Pq Event 91H , Umask 00H +The number of +.Li RET +instructions executed that were mispredicted at decode time. +.It Li BR_RET_EXEC +.Pq Event 8FH , Umask 00H +The number of +.Li RET +instructions executed. +.It Li BR_TKN_BUBBLE_1 +.Pq Event 97H , Umask 00H +The number of branch predicted taken with bubble 1.
+.It Li BR_TKN_BUBBLE_2 +.Pq Event 98H , Umask 00H +The number of branch predicted taken with bubble 2. +.It Li BUSQ_EMPTY Op ,core= Ns Ar core +.Pq Event 7DH +The number of cycles during which the core did not have any pending +transactions in the bus queue. +.It Li BUS_BNR_DRV Op ,agent= Ns Ar agent +.Pq Event 61H +The number of Bus Not Ready signals asserted on the bus. +.It Li BUS_DATA_RCV Op ,core= Ns Ar core +.Pq Event 64H +The number of bus cycles during which the processor is receiving data. +.It Li BUS_DRDY_CLOCKS Op ,agent= Ns Ar agent +.Pq Event 62H +The number of bus cycles during which the Data Ready signal is asserted +on the bus. +.It Li BUS_HIT_DRV Op ,agent= Ns Ar agent +.Pq Event 7AH +The number of bus cycles during which the processor drives the +.Li HIT# +pin. +.It Li BUS_HITM_DRV Op ,agent= Ns Ar agent +.Pq Event 7BH +The number of bus cycles during which the processor drives the +.Li HITM# +pin. +.It Li BUS_IO_WAIT Op ,core= Ns Ar core +.Pq Event 7FH +The number of core cycles during which I/O requests wait in the bus +queue. +.It Li BUS_LOCK_CLOCKS Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 63H +The number of bus cycles during which the +.Li LOCK +signal was asserted on the bus. +.It Li BUS_REQUEST_OUTSTANDING Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 60H +The number of pending full cache line read transactions on the bus +occurring in each cycle. +.It Li BUS_TRANS_P Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6BH +The number of partial bus transactions. +.It Li BUS_TRANS_IFETCH Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 68H +The number of instruction fetch full cache line bus transactions. +.It Li BUS_TRANS_INVAL Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 69H +The number of invalidate bus transactions. 
+.It Li BUS_TRANS_PWR Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6AH +The number of partial write bus transactions. +.It Li BUS_TRANS_DEF Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6DH +The number of deferred bus transactions. +.It Li BUS_TRANS_BURST Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6EH +The number of burst transactions. +.It Li BUS_TRANS_MEM Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6FH +The number of memory bus transactions. +.It Li BUS_TRANS_ANY Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 70H +The number of bus transactions of any kind. +.It Li BUS_TRANS_BRD Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 65H +The number of burst read transactions. +.It Li BUS_TRANS_IO Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 6CH +The number of completed I/O bus transactions due to +.Li IN +and +.Li OUT +instructions. +.It Li BUS_TRANS_RFO Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 66H +The number of Read For Ownership bus transactions. +.It Li BUS_TRANS_WB Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 67H +The number of explicit write-back bus transactions due to dirty line +evictions. +.It Li CMP_SNOOP Xo +.Op ,core= Ns Ar core +.Op ,snooptype= Ns Ar type +.Xc +.Pq Event 78H +The number of times the L1 data cache is snooped by the other core in +the same processor. +.It Li CPU_CLK_UNHALTED.BUS +.Pq Event 3CH , Umask 01H +.Pq Alias Qq "Unhalted Reference Cycles" +The number of bus cycles when the core is not in the halt state. +This is an architectural performance event. +.It Li CPU_CLK_UNHALTED.CORE_P +.Pq Event 3CH , Umask 00H +.Pq Alias Qq "Unhalted Core Cycles" +The number of core cycles while the core is not in a halt state. +This is an architectural performance event.
+.It Li CPU_CLK_UNHALTED.NO_OTHER +.Pq Event 3CH , Umask 02H +The number of bus cycles during which the core remains unhalted and +the other core is halted. +.It Li CYCLES_DIV_BUSY +.Pq Event 14H , Umask 00H +The number of cycles the divider is busy. +This event is only available on PMC0. +.It Li CYCLES_INT_MASKED +.Pq Event C6H , Umask 01H +The number of cycles during which interrupts are disabled. +.It Li CYCLES_INT_PENDING_AND_MASKED +.Pq Event C6H , Umask 02H +The number of cycles during which there were pending interrupts while +interrupts were disabled. +.It Li CYCLES_L1I_MEM_STALLED +.Pq Event 86H , Umask 00H +The number of cycles for which an instruction fetch stalls. +.It Li DELAYED_BYPASS.FP +.Pq Event 19H , Umask 00H +The number of floating point operations that used data immediately +after the data was generated by a non floating point execution unit. +.It Li DELAYED_BYPASS.LOAD +.Pq Event 19H , Umask 01H +The number of delayed bypass penalty cycles that a load operation incurred. +.It Li DELAYED_BYPASS.SIMD +.Pq Event 19H , Umask 02H +The number of times SIMD operations use data immediately after the data +was generated by a non-SIMD execution unit. +.It Li DIV +.Pq Event 13H , Umask 00H +The number of divide operations executed. +This event is only available on PMC1. +.It Li DTLB_MISSES.ANY +.Pq Event 08H , Umask 01H +The number of Data TLB misses, including misses that result from +speculative accesses. +.It Li DTLB_MISSES.L0_MISS_LD +.Pq Event 08H , Umask 04H +The number of level 0 DTLB misses due to load operations. +.It Li DTLB_MISSES.MISS_LD +.Pq Event 08H , Umask 02H +The number of Data TLB misses due to load operations. +.It Li DTLB_MISSES.MISS_ST +.Pq Event 08H , Umask 08H +The number of Data TLB misses due to store operations. +.It Li EIST_TRANS +.Pq Event 3AH , Umask 00H +The number of Enhanced Intel SpeedStep Technology transitions. +.It Li ESP.ADDITIONS +.Pq Event ABH , Umask 02H +The number of automatic additions to the +.Li %esp +register.
+.It Li ESP.SYNCH +.Pq Event ABH , Umask 01H +The number of times the +.Li %esp +register was explicitly used in an address expression after +it is implicitly used by a +.Li PUSH +or +.Li POP +instruction. +.It Li EXT_SNOOP Xo +.Op ,agent= Ns Ar agent +.Op ,snoopresponse= Ns Ar response +.Xc +.Pq Event 77H +The number of snoop responses to bus transactions. +.It Li FP_ASSIST +.Pq Event 11H , Umask 00H +The number of floating point operations executed that needed +a microcode assist. +.It Li FP_COMP_OPS_EXE +.Pq Event 10H , Umask 00H +The number of floating point computational micro-ops executed. +The event is available only on PMC0. +.It Li FP_MMX_TRANS_TO_FP +.Pq Event CCH , Umask 02H +The number of transitions from MMX instructions to floating point +instructions. +.It Li FP_MMX_TRANS_TO_MMX +.Pq Event CCH , Umask 01H +The number of transitions from floating point instructions to MMX +instructions. +.It Li HW_INT_RCV +.Pq Event C8H , Umask 00H +The number of hardware interrupts received. +.It Li IDLE_DURING_DIV +.Pq Event 18H , Umask 00H +The number of cycles the divider is busy and no other execution unit +or load operation was in progress. +This event is available only on PMC0. +.It Li ILD_STALL +.Pq Event 87H , Umask 00H +The number of cycles the instruction length decoder stalled due to a +length changing prefix. +.It Li INST_QUEUE.FULL +.Pq Event 83H , Umask 02H +The number of cycles during which the instruction queue is full. +.It Li INST_RETIRED.ANY_P +.Pq Event C0H , Umask 00H +.Pq Alias Qq "Instruction Retired" +The number of instructions retired. +This is an architectural performance event. +.It Li INST_RETIRED.LOADS +.Pq Event C0H , Umask 01H +The number of instructions retired that contained a load operation. +.It Li INST_RETIRED.OTHER +.Pq Event C0H , Umask 04H +The number of instructions retired that did not contain a load or a +store operation. 
+.It Li INST_RETIRED.STORES +.Pq Event C0H , Umask 02H +The number of instructions retired that contained a store operation. +.It Li INST_RETIRED.VM_H +.Pq Event C0H , Umask 08H +.Pq Tn Core2Extreme +The number of instructions retired while in VMX root operation. +.It Li ITLB.FLUSH +.Pq Event 82H , Umask 40H +The number of ITLB flushes. +.It Li ITLB.LARGE_MISS +.Pq Event 82H , Umask 10H +The number of instruction fetches from large pages that miss the +ITLB. +.It Li ITLB.MISSES +.Pq Event 82H , Umask 12H +The number of instruction fetches from both large and small pages that +miss the ITLB. +.It Li ITLB.SMALL_MISS +.Pq Event 82H , Umask 02H +The number of instruction fetches from small pages that miss the ITLB. +.It Li ITLB_MISS_RETIRED +.Pq Event C9H , Umask 00H +The number of retired instructions that missed the ITLB when they were +fetched. +.It Li L1D_ALL_REF +.Pq Event 43H , Umask 01H +The number of references to L1 data cache counting loads and stores +to all memory types. +.It Li L1D_ALL_CACHE_REF +.Pq Event 43H , Umask 02H +The number of data reads and writes to cacheable memory. +.It Li L1D_CACHE_LOCK Op ,cachestate= Ns Ar state +.Pq Event 42H +The number of locked reads from cacheable memory. +.It Li L1D_CACHE_LOCK_DURATION +.Pq Event 42H , Umask 10H +The number of cycles during which any cache line is locked by any +locking instruction. +.It Li L1D_CACHE_LD Op ,cachestate= Ns Ar state +.Pq Event 40H +The number of data reads from cacheable memory excluding locked +reads. +.It Li L1D_CACHE_ST Op ,cachestate= Ns Ar state +.Pq Event 41H +The number of data writes to cacheable memory excluding locked +writes. +.It Li L1D_M_EVICT +.Pq Event 47H , Umask 00H +The number of modified cache lines evicted from L1 data cache. +.It Li L1D_M_REPL +.Pq Event 46H , Umask 00H +The number of modified lines allocated in L1 data cache. +.It Li L1D_PEND_MISS +.Pq Event 48H , Umask 00H +The total number of outstanding L1 data cache misses at any clock.
+.It Li L1D_PREFETCH.REQUESTS +.Pq Event 4EH , Umask 10H +The number of times L1 data cache requested to prefetch a data cache +line. +.It Li L1D_REPL +.Pq Event 45H , Umask 0FH +The number of lines brought into L1 data cache. +.It Li L1D_SPLIT.LOADS +.Pq Event 49H , Umask 01H +The number of load operations that span two cache lines. +.It Li L1D_SPLIT.STORES +.Pq Event 49H , Umask 02H +The number of store operations that span two cache lines. +.It Li L1I_MISSES +.Pq Event 81H , Umask 00H +The number of instruction fetch unit misses. +.It Li L1I_READS +.Pq Event 80H , Umask 00H +The number of instruction fetches. +.It Li L2_ADS Op ,core= Ns Ar core +.Pq Event 21H +The number of cycles that the L2 address bus is in use. +.It Li L2_DBUS_BUSY_RD Op ,core= Ns Ar core +.Pq Event 23H +The number of cycles during which the L2 data bus is busy transferring +data to the core. +.It Li L2_IFETCH Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Xc +.Pq Event 28H +The number of instruction cache line requests from the instruction +fetch unit. +.It Li L2_LD Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 29H +The number of L2 cache read requests from L1 cache and L2 +prefetchers. +.It Li L2_LINES_IN Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 24H +The number of cache lines allocated in L2 cache. +.It Li L2_LINES_OUT Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 26H +The number of L2 cache lines evicted. +.It Li L2_LOCK Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Xc +.Pq Event 2BH +The number of locked accesses to cache lines that miss L1 data +cache. +.It Li L2_M_LINES_IN Op ,core= Ns Ar core +.Pq Event 25H +The number of L2 cache line modifications. +.It Li L2_M_LINES_OUT Xo +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 27H +The number of modified lines evicted from L2 cache.
+.It Li L2_NO_REQ Op ,core= Ns Ar core +.Pq Event 32H +The number of cycles during which no L2 cache requests were pending +from a core. +.It Li L2_REJECT_BUSQ Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 30H +The number of L2 cache requests that were rejected. +.It Li L2_RQSTS Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Op ,prefetch= Ns Ar prefetch +.Xc +.Pq Event 2EH +The number of completed L2 cache requests. +.It Li L2_RQSTS.SELF.DEMAND.I_STATE +.Pq Event 2EH , Umask 41H +.Pq Alias Qq "LLC Misses" +The number of completed L2 cache demand requests from this core that +missed the L2 cache. +This is an architectural performance event. +.It Li L2_RQSTS.SELF.DEMAND.MESI +.Pq Event 2EH , Umask 4FH +.Pq Alias Qq "LLC References" +The number of completed L2 cache demand requests from this core. +This is an architectural performance event. +.It Li L2_ST Xo +.Op ,cachestate= Ns Ar state +.Op ,core= Ns Ar core +.Xc +.Pq Event 2AH +The number of store operations that miss the L1 cache and request data +from the L2 cache. +.It Li LOAD_BLOCK.L1D +.Pq Event 03H , Umask 20H +The number of loads blocked by the L1 data cache. +.It Li LOAD_BLOCK.OVERLAP_STORE +.Pq Event 03H , Umask 08H +The number of loads that partially overlap an earlier store or are +aliased with a previous store. +.It Li LOAD_BLOCK.STA +.Pq Event 03H , Umask 02H +The number of loads blocked by preceding stores whose address is yet +to be calculated. +.It Li LOAD_BLOCK.STD +.Pq Event 03H , Umask 04H +The number of loads blocked by preceding stores to the same address +whose data value is not known. +.It Li LOAD_BLOCK.UNTIL_RETIRE +.Pq Event 03H , Umask 10H +The number of load operations that were blocked until retirement. +.It Li LOAD_HIT_PRE +.Pq Event 4CH , Umask 00H +The number of load operations that conflicted with a prefetch to the +same cache line.
+.It Li MACHINE_NUKES.SMC +.Pq Event C3H , Umask 01H +The number of times a program writes to a code section. +.It Li MACHINE_NUKES.MEM_ORDER +.Pq Event C3H , Umask 04H +The number of times the execution pipeline was restarted due to a +memory ordering conflict or memory disambiguation misprediction. +.It Li MACRO_INSTS.CISC_DECODED +.Pq Event AAH , Umask 08H +The number of complex instructions decoded. +.It Li MACRO_INSTS.DECODED +.Pq Event AAH , Umask 01H +The number of instructions decoded. +.It Li MEMORY_DISAMBIGUATION.RESET +.Pq Event 09H , Umask 01H +The number of cycles during which memory disambiguation misprediction +occurs. +.It Li MEMORY_DISAMBIGUATION.SUCCESS +.Pq Event 09H , Umask 02H +The number of load operations that were successfully disambiguated. +.It Li MEM_LOAD_RETIRED.DTLB_MISS +.Pq Event CBH , Umask 10H +The number of retired loads that missed the DTLB. +.It Li MEM_LOAD_RETIRED.L1D_LINE_MISS +.Pq Event CBH , Umask 02H +The number of retired load operations that missed L1 data cache and +that sent a request to L2 cache. +This event is only available on PMC0. +.It Li MEM_LOAD_RETIRED.L1D_MISS +.Pq Event CBH , Umask 01H +The number of retired load operations that missed L1 data cache. +This event is only available on PMC0. +.It Li MEM_LOAD_RETIRED.L2_LINE_MISS +.Pq Event CBH , Umask 08H +The number of load operations that missed L2 cache and that caused a +bus request. +.It Li MEM_LOAD_RETIRED.L2_MISS +.Pq Event CBH , Umask 04H +The number of load operations that missed L2 cache. +.It Li MUL +.Pq Event 12H , Umask 00H +The number of multiply operations executed. +This event is only available on PMC1. +.It Li PAGE_WALKS.COUNT +.Pq Event 0CH , Umask 01H +The number of page walks executed due to an ITLB or DTLB miss. +.It Li PAGE_WALKS.CYCLES +.Pq Event 0CH , Umask 02H +The number of cycles spent in a page walk caused by an ITLB or DTLB +miss. 
+.It Li PREF_RQSTS_DN +.Pq Event F8H , Umask 00H +The number of downward prefetches issued from the Data Prefetch Logic +unit to L2 cache. +.It Li PREF_RQSTS_UP +.Pq Event F0H , Umask 00H +The number of upward prefetches issued from the Data Prefetch Logic +unit to L2 cache. +.It Li RAT_STALLS.ANY +.Pq Event D2H , Umask 0FH +The number of stall cycles due to any of +.Li RAT_STALLS.FLAGS , +.Li RAT_STALLS.FPSW , +.Li RAT_STALLS.PARTIAL_CYCLES +and +.Li RAT_STALLS.ROB_READ_PORT . +.It Li RAT_STALLS.FLAGS +.Pq Event D2H , Umask 04H +The number of cycles execution stalled due to a flag register induced +stall. +.It Li RAT_STALLS.FPSW +.Pq Event D2H , Umask 08H +The number of times the floating point status word was written. +.It Li RAT_STALLS.OTHER_SERIALIZATION_STALLS +.Pq Event D2H , Umask 10H , Tn Core2Extreme +The number of stalls due to other RAT resource serialization not +counted by umask 0FH. +.It Li RAT_STALLS.PARTIAL_CYCLES +.Pq Event D2H , Umask 02H +The number of cycles of added instruction execution latency due to the +use of a register that was partially written by previous instructions. +.It Li RAT_STALLS.ROB_READ_PORT +.Pq Event D2H , Umask 01H +The number of cycles when ROB read port stalls occurred. +.It Li RESOURCE_STALLS.ANY +.Pq Event DCH , Umask 1FH +The number of cycles during which any resource related stall +occurred. +.It Li RESOURCE_STALLS.BR_MISS_CLEAR +.Pq Event DCH , Umask 10H +The number of cycles stalled due to branch misprediction. +.It Li RESOURCE_STALLS.FPCW +.Pq Event DCH , Umask 08H +The number of cycles stalled due to writing the floating point control +word. +.It Li RESOURCE_STALLS.LD_ST +.Pq Event DCH , Umask 04H +The number of cycles during which the number of loads and stores in +the pipeline exceeded their limits. +.It Li RESOURCE_STALLS.ROB_FULL +.Pq Event DCH , Umask 01H +The number of cycles when the reorder buffer was full.
+.It Li RESOURCE_STALLS.RS_FULL +.Pq Event DCH , Umask 02H +The number of cycles during which the RS was full. +.It Li RS_UOPS_DISPATCHED +.Pq Event A0H , Umask 00H +The number of micro-ops dispatched for execution. +.It Li RS_UOPS_DISPATCHED.PORT0 +.Pq Event A1H , Umask 01H +The number of cycles micro-ops were dispatched for execution on port +0. +.It Li RS_UOPS_DISPATCHED.PORT1 +.Pq Event A1H , Umask 02H +The number of cycles micro-ops were dispatched for execution on port +1. +.It Li RS_UOPS_DISPATCHED.PORT2 +.Pq Event A1H , Umask 04H +The number of cycles micro-ops were dispatched for execution on port +2. +.It Li RS_UOPS_DISPATCHED.PORT3 +.Pq Event A1H , Umask 08H +The number of cycles micro-ops were dispatched for execution on port +3. +.It Li RS_UOPS_DISPATCHED.PORT4 +.Pq Event A1H , Umask 10H +The number of cycles micro-ops were dispatched for execution on port +4. +.It Li RS_UOPS_DISPATCHED.PORT5 +.Pq Event A1H , Umask 20H +The number of cycles micro-ops were dispatched for execution on port +5. +.It Li SB_DRAIN_CYCLES +.Pq Event 04H , Umask 01H +The number of cycles while the store buffer is draining. +.It Li SEGMENT_REG_LOADS +.Pq Event 06H , Umask 00H +The number of segment register loads. +.It Li SEG_REG_RENAMES.ANY +.Pq Event D5H , Umask 0FH +The number of times any segment register was renamed. +.It Li SEG_REG_RENAMES.DS +.Pq Event D5H , Umask 02H +The number of times the +.Li %ds +register is renamed. +.It Li SEG_REG_RENAMES.ES +.Pq Event D5H , Umask 01H +The number of times the +.Li %es +register is renamed. +.It Li SEG_REG_RENAMES.FS +.Pq Event D5H , Umask 04H +The number of times the +.Li %fs +register is renamed. +.It Li SEG_REG_RENAMES.GS +.Pq Event D5H , Umask 08H +The number of times the +.Li %gs +register is renamed. +.It Li SEG_RENAME_STALLS.ANY +.Pq Event D4H , Umask 0FH +The number of stalls due to lack of resource to rename any segment +register.
+.It Li SEG_RENAME_STALLS.DS +.Pq Event D4H , Umask 02H +The number of stalls due to lack of renaming resources for the +.Li %ds +register. +.It Li SEG_RENAME_STALLS.ES +.Pq Event D4H , Umask 01H +The number of stalls due to lack of renaming resources for the +.Li %es +register. +.It Li SEG_RENAME_STALLS.FS +.Pq Event D4H , Umask 04H +The number of stalls due to lack of renaming resources for the +.Li %fs +register. +.It Li SEG_RENAME_STALLS.GS +.Pq Event D4H , Umask 08H +The number of stalls due to lack of renaming resources for the +.Li %gs +register. +.It Li SIMD_ASSIST +.Pq Event CDH , Umask 00H +The number of SIMD assists invoked. +.It Li SIMD_COMP_INST_RETIRED.PACKED_DOUBLE +.Pq Event CAH , Umask 04H +The number of computational SSE2 packed double precision instructions +retired. +.It Li SIMD_COMP_INST_RETIRED.PACKED_SINGLE +.Pq Event CAH , Umask 01H +The number of computational SSE2 packed single precision instructions +retired. +.It Li SIMD_COMP_INST_RETIRED.SCALAR_DOUBLE +.Pq Event CAH , Umask 08H +The number of computational SSE2 scalar double precision instructions +retired. +.It Li SIMD_COMP_INST_RETIRED.SCALAR_SINGLE +.Pq Event CAH , Umask 02H +The number of computational SSE2 scalar single precision instructions +retired. +.It Li SIMD_INSTR_RETIRED +.Pq Event CEH , Umask 00H +The number of retired SIMD instructions that use MMX registers. +.It Li SIMD_INST_RETIRED.ANY +.Pq Event C7H , Umask 1FH +The number of streaming SIMD instructions retired. +.It Li SIMD_INST_RETIRED.PACKED_DOUBLE +.Pq Event C7H , Umask 04H +The number of SSE2 packed double precision instructions retired. +.It Li SIMD_INST_RETIRED.PACKED_SINGLE +.Pq Event C7H , Umask 01H +The number of SSE packed single precision instructions retired. +.It Li SIMD_INST_RETIRED.SCALAR_DOUBLE +.Pq Event C7H , Umask 08H +The number of SSE2 scalar double precision instructions retired.
+.It Li SIMD_INST_RETIRED.SCALAR_SINGLE +.Pq Event C7H , Umask 02H +The number of SSE scalar single precision instructions retired. +.It Li SIMD_INST_RETIRED.VECTOR +.Pq Event C7H , Umask 10H +The number of SSE2 vector instructions retired. +.It Li SIMD_SAT_INSTR_RETIRED +.Pq Event CFH , Umask 00H +The number of saturated arithmetic SIMD instructions retired. +.It Li SIMD_SAT_UOP_EXEC +.Pq Event B1H , Umask 00H +The number of SIMD saturated arithmetic micro-ops executed. +.It Li SIMD_UOPS_EXEC +.Pq Event B0H , Umask 00H +The number of SIMD micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.ARITHMETIC +.Pq Event B3H , Umask 20H +The number of SIMD packed arithmetic micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.LOGICAL +.Pq Event B3H , Umask 10H +The number of SIMD packed logical micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.MUL +.Pq Event B3H , Umask 01H +The number of SIMD packed multiply micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.PACK +.Pq Event B3H , Umask 04H +The number of SIMD pack micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.SHIFT +.Pq Event B3H , Umask 02H +The number of SIMD packed shift micro-ops executed. +.It Li SIMD_UOP_TYPE_EXEC.UNPACK +.Pq Event B3H , Umask 08H +The number of SIMD unpack micro-ops executed. +.It Li SNOOP_STALL_DRV Xo +.Op ,agent= Ns Ar agent +.Op ,core= Ns Ar core +.Xc +.Pq Event 7EH +The number of times the bus stalled for snoops. +.It Li SSE_PRE_EXEC.L1 +.Pq Event 07H , Umask 01H +The number of +.Li PREFETCHT0 +instructions executed. +.It Li SSE_PRE_EXEC.L2 +.Pq Event 07H , Umask 02H +The number of +.Li PREFETCHT1 +instructions executed. +.It Li SSE_PRE_EXEC.NTA +.Pq Event 07H , Umask 00H +The number of +.Li PREFETCHNTA +instructions executed. +.It Li SSE_PRE_EXEC.STORES +.Pq Event 07H , Umask 03H +The number of times SSE non-temporal store instructions were executed. +.It Li SSE_PRE_MISS.L1 +.Pq Event 4BH , Umask 01H +The number of times the +.Li PREFETCHT0 +instruction executed and missed all cache levels. 
+.It Li SSE_PRE_MISS.L2 +.Pq Event 4BH , Umask 02H +The number of times the +.Li PREFETCHT1 +instruction executed and missed all cache levels. +.It Li SSE_PRE_MISS.NTA +.Pq Event 4BH , Umask 00H +The number of times the +.Li PREFETCHNTA +instruction executed and missed all cache levels. +.It Li STORE_BLOCK.ORDER +.Pq Event 04H , Umask 02H +The number of cycles while a store was waiting for another store to be +globally observed. +.It Li STORE_BLOCK.SNOOP +.Pq Event 04H , Umask 08H +The number of cycles while a store was blocked due to a conflict with +an internal or external snoop. +.It Li THERMAL_TRIP +.Pq Event 3BH , Umask C0H +The number of thermal trips. +.It Li UOPS_RETIRED.LD_IND_BR +.Pq Event C2H , Umask 01H +The number of micro-ops retired that fused a load with another +operation. +.It Li UOPS_RETIRED.STD_STA +.Pq Event C2H , Umask 02H +The number of store address calculations that fused into one micro-op. +.It Li UOPS_RETIRED.MACRO_FUSION +.Pq Event C2H , Umask 04H +The number of times retired instruction pairs were fused into one +micro-op. +.It Li UOPS_RETIRED.FUSED +.Pq Event C2H , Umask 07H +The number of fused micro-ops retired. +.It Li UOPS_RETIRED.NON_FUSED +.Pq Event C2H , Umask 8H +The number of non-fused micro-ops retired. +.It Li UOPS_RETIRED.ANY +.Pq Event C2H , Umask 0FH +The number of micro-ops retired. +.It Li X87_OPS_RETIRED.ANY +.Pq Event C1H , Umask FEH +The number of floating point computational instructions retired. +.It Li X87_OPS_RETIRED.FXCH +.Pq Event C1H , Umask 01H +The number of +.Li FXCH +instructions retired. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used. 
+.Bl -column "branch-mispredicts" "cpu_clk_unhalted.core_p" "PMC Class" +.It Em Alias Ta Em Event Ta Em PMC Class +.It Li branches Ta Li BR_INST_RETIRED.ANY Ta Li PMC_CLASS_IAP +.It Li branch-mispredicts Ta Li BR_INST_RETIRED.MISPRED Ta Li PMC_CLASS_IAP +.It Li ic-misses Ta Li L1I_MISSES Ta Li PMC_CLASS_IAP +.It Li instructions Ta Li INST_RETIRED.ANY_P Ta Li PMC_CLASS_IAF +.It Li interrupts Ta Li HW_INT_RCV Ta Li PMC_CLASS_IAP +.It Li unhalted-cycles Ta Li CPU_CLK_UNHALTED.CORE_P Ta Li PMC_CLASS_IAF +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.corei7.3 b/lib/libpmc/pmc.corei7.3 new file mode 100644 index 0000000..679313f --- /dev/null +++ b/lib/libpmc/pmc.corei7.3 @@ -0,0 +1,1581 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2010 +.Dt PMC.COREI7 3 +.Os +.Sh NAME +.Nm pmc.corei7 +.Nd measurement events for +.Tn Intel +.Tn Core i7 and Xeon 5500 +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Core i7" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +These CPUs may contain up to three classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_IAP" +.It Li PMC_CLASS_IAF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_IAP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Core i7 and Xeon 5500 PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Ss COREI7 AND XEON 5500 FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.iaf 3 . +Not all CPUs in this family implement fixed-function counters.
+.Ss COREI7 AND XEON 5500 PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li rsp= Ns Ar value +Configure the Off-core Response bits. +.Bl -tag -width indent +.It Li DMND_DATA_RD +Counts the number of demand and DCU prefetch data reads of full +and partial cachelines as well as demand data page table entry +cacheline reads. Does not count L2 data read prefetches or +instruction fetches. +.It Li DMND_RFO +Counts the number of demand and DCU prefetch reads for ownership +(RFO) requests generated by a write to data cacheline. Does not +count L2 RFO. +.It Li DMND_IFETCH +Counts the number of demand and DCU prefetch instruction cacheline +reads. Does not count L2 code read prefetches. +.It Li WB +Counts the number of writeback (modified to exclusive) transactions. +.It Li PF_DATA_RD +Counts the number of data cacheline reads generated by L2 prefetchers. +.It Li PF_RFO +Counts the number of RFO requests generated by L2 prefetchers. +.It Li PF_IFETCH +Counts the number of code reads generated by L2 prefetchers. +.It Li OTHER +Counts one of the following transaction types, including L3 invalidate, +I/O, full or partial writes, WC or non-temporal stores, CLFLUSH, Fences, +lock, unlock, split lock. +.It Li UNCORE_HIT +L3 Hit: local or remote home requests that hit L3 cache in the uncore +with no coherency actions required (snooping).
+.It Li OTHER_CORE_HIT_SNP +L3 Hit: local or remote home requests that hit L3 cache in the uncore +and was serviced by another core with a cross core snoop where no modified +copies were found (clean). +.It Li OTHER_CORE_HITM +L3 Hit: local or remote home requests that hit L3 cache in the uncore +and was serviced by another core with a cross core snoop where modified +copies were found (HITM). +.It Li REMOTE_CACHE_FWD +L3 Miss: local homed requests that missed the L3 cache and was serviced +by forwarded data following a cross package snoop where no modified +copies found. (Remote home requests are not counted) +.It Li REMOTE_DRAM +L3 Miss: remote home requests that missed the L3 cache and were serviced +by remote DRAM. +.It Li LOCAL_DRAM +L3 Miss: local home requests that missed the L3 cache and were serviced +by local DRAM. +.It Li NON_DRAM +Non-DRAM requests that were serviced by IOH. +.El +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.It Li os +Configure the PMC to count events happening at processor privilege +level 0. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. 
+.Ss Event Specifiers (Programmable PMCs) +Core i7 and Xeon 5500 programmable PMCs support the following events: +.Bl -tag -width indent +.It Li SB_DRAIN.ANY +.Pq Event 04H , Umask 07H +Counts the number of store buffer drains. +.It Li STORE_BLOCKS.AT_RET +.Pq Event 06H , Umask 04H +Counts number of loads delayed with at-Retirement block code. The following +loads need to be executed at retirement and wait for all senior stores on +the same thread to be drained: load splitting across 4K boundary (page +split), load accessing uncacheable (UC or USWC) memory, load lock, and load +with page table in UC or USWC memory region. +.It Li STORE_BLOCKS.L1D_BLOCK +.Pq Event 06H , Umask 08H +Cacheable loads delayed with L1D block code. +.It Li PARTIAL_ADDRESS_ALIAS +.Pq Event 07H , Umask 01H +Counts false dependency due to partial address aliasing. +.It Li DTLB_LOAD_MISSES.ANY +.Pq Event 08H , Umask 01H +Counts all load misses that cause a page walk. +.It Li DTLB_LOAD_MISSES.WALK_COMPLETED +.Pq Event 08H , Umask 02H +Counts number of completed page walks due to load miss in the STLB. +.It Li DTLB_LOAD_MISSES.STLB_HIT +.Pq Event 08H , Umask 10H +Number of cache load STLB hits. +.It Li DTLB_LOAD_MISSES.PDE_MISS +.Pq Event 08H , Umask 20H +Number of DTLB cache load misses where the low part of the linear to +physical address translation was missed. +.It Li DTLB_LOAD_MISSES.PDP_MISS +.Pq Event 08H , Umask 40H +Number of DTLB cache load misses where the high part of the linear to +physical address translation was missed. +.It Li DTLB_LOAD_MISSES.LARGE_WALK_COMPLETED +.Pq Event 08H , Umask 80H +Counts number of completed large page walks due to load miss in the STLB. +.It Li MEM_INST_RETIRED.LOADS +.Pq Event 0BH , Umask 01H +Counts the number of instructions with an architecturally-visible load +retired on the architected path.
+In conjunction with ld_lat facility. +.It Li MEM_INST_RETIRED.STORES +.Pq Event 0BH , Umask 02H +Counts the number of instructions with an architecturally-visible store +retired on the architected path. +In conjunction with ld_lat facility. +.It Li MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD +.Pq Event 0BH , Umask 10H +Counts the number of instructions exceeding the latency specified with +ld_lat facility. +In conjunction with ld_lat facility. +.It Li MEM_STORE_RETIRED.DTLB_MISS +.Pq Event 0CH , Umask 01H +The event counts the number of retired stores that missed the DTLB. The DTLB +miss is not counted if the store operation causes a fault. Does not count +prefetches. Counts both primary and secondary misses to the TLB. +.It Li UOPS_ISSUED.ANY +.Pq Event 0EH , Umask 01H +Counts the number of Uops issued by the Register Allocation Table to the +Reservation Station, i.e. the UOPs issued from the front end to the back +end. +.It Li UOPS_ISSUED.STALLED_CYCLES +.Pq Event 0EH , Umask 01H +Counts the number of cycles no Uops issued by the Register Allocation Table +to the Reservation Station, i.e. the UOPs issued from the front end to the +back end. +set invert=1, cmask = 1 +.It Li UOPS_ISSUED.FUSED +.Pq Event 0EH , Umask 02H +Counts the number of fused Uops that were issued from the Register +Allocation Table to the Reservation Station. +.It Li MEM_UNCORE_RETIRED.L3_DATA_MISS_UNKNOWN +.Pq Event 0FH , Umask 01H +Counts number of memory load instructions retired where the memory reference +missed L3 and data source is unknown. +Available only for CPUID signature 06_2EH +.It Li MEM_UNCORE_RETIRED.OTHER_CORE_L2_HITM +.Pq Event 0FH , Umask 02H +Counts number of memory load instructions retired where the memory reference +hit modified data in a sibling core residing on the same socket.
+.It Li MEM_UNCORE_RETIRED.REMOTE_CACHE_LOCAL_HOME_HIT +.Pq Event 0FH , Umask 08H +Counts number of memory load instructions retired where the memory reference +missed the L1, L2 and L3 caches and HIT in a remote socket's cache. Only +counts locally homed lines. +.It Li MEM_UNCORE_RETIRED.REMOTE_DRAM +.Pq Event 0FH , Umask 10H +Counts number of memory load instructions retired where the memory reference +missed the L1, L2 and L3 caches and was remotely homed. This includes both +DRAM access and HITM in a remote socket's cache for remotely homed lines. +.It Li MEM_UNCORE_RETIRED.LOCAL_DRAM +.Pq Event 0FH , Umask 20H +Counts number of memory load instructions retired where the memory reference +missed the L1, L2 and L3 caches and required a local socket memory +reference. This includes locally homed cachelines that were in a modified +state in another socket. +.It Li MEM_UNCORE_RETIRED.UNCACHEABLE +.Pq Event 0FH , Umask 80H +Counts number of memory load instructions retired where the memory reference +missed the L1, L2 and L3 caches and to perform I/O. +Available only for CPUID signature 06_2EH +.It Li FP_COMP_OPS_EXE.X87 +.Pq Event 10H , Umask 01H +Counts the number of FP Computational Uops Executed. The number of FADD, +FSUB, FCOM, FMULs, integer MULs and IMULs, FDIVs, FPREMs, FSQRTS, integer +DIVs, and IDIVs. This event does not distinguish an FADD used in the middle +of a transcendental flow from a separate FADD instruction. +.It Li FP_COMP_OPS_EXE.MMX +.Pq Event 10H , Umask 02H +Counts number of MMX Uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP +.Pq Event 10H , Umask 04H +Counts number of SSE and SSE2 FP uops executed. +.It Li FP_COMP_OPS_EXE.SSE2_INTEGER +.Pq Event 10H , Umask 08H +Counts number of SSE2 integer uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP_PACKED +.Pq Event 10H , Umask 10H +Counts number of SSE FP packed uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP_SCALAR +.Pq Event 10H , Umask 20H +Counts number of SSE FP scalar uops executed.
+.It Li FP_COMP_OPS_EXE.SSE_SINGLE_PRECISION +.Pq Event 10H , Umask 40H +Counts number of SSE* FP single precision uops executed. +.It Li FP_COMP_OPS_EXE.SSE_DOUBLE_PRECISION +.Pq Event 10H , Umask 80H +Counts number of SSE* FP double precision uops executed. +.It Li SIMD_INT_128.PACKED_MPY +.Pq Event 12H , Umask 01H +Counts number of 128 bit SIMD integer multiply operations. +.It Li SIMD_INT_128.PACKED_SHIFT +.Pq Event 12H , Umask 02H +Counts number of 128 bit SIMD integer shift operations. +.It Li SIMD_INT_128.PACK +.Pq Event 12H , Umask 04H +Counts number of 128 bit SIMD integer pack operations. +.It Li SIMD_INT_128.UNPACK +.Pq Event 12H , Umask 08H +Counts number of 128 bit SIMD integer unpack operations. +.It Li SIMD_INT_128.PACKED_LOGICAL +.Pq Event 12H , Umask 10H +Counts number of 128 bit SIMD integer logical operations. +.It Li SIMD_INT_128.PACKED_ARITH +.Pq Event 12H , Umask 20H +Counts number of 128 bit SIMD integer arithmetic operations. +.It Li SIMD_INT_128.SHUFFLE_MOVE +.Pq Event 12H , Umask 40H +Counts number of 128 bit SIMD integer shuffle and move operations. +.It Li LOAD_DISPATCH.RS +.Pq Event 13H , Umask 01H +Counts number of loads dispatched from the Reservation Station that bypass +the Memory Order Buffer. +.It Li LOAD_DISPATCH.RS_DELAYED +.Pq Event 13H , Umask 02H +Counts the number of delayed RS dispatches at the stage latch. If an RS +dispatch can not bypass to LB, it has another chance to dispatch from the +one-cycle delayed staging latch before it is written into the LB. +.It Li LOAD_DISPATCH.MOB +.Pq Event 13H , Umask 04H +Counts the number of loads dispatched from the Reservation Station to the +Memory Order Buffer. +.It Li LOAD_DISPATCH.ANY +.Pq Event 13H , Umask 07H +Counts all loads dispatched from the Reservation Station. +.It Li ARITH.CYCLES_DIV_BUSY +.Pq Event 14H , Umask 01H +Counts the number of cycles the divider is busy executing divide or square +root operations. 
The divide can be integer, X87 or Streaming SIMD Extensions +(SSE). The square root operation can be either X87 or SSE. +Set 'edge =1, invert=1, cmask=1' to count the number of divides. +Count may be incorrect when SMT is on. +.It Li ARITH.MUL +.Pq Event 14H , Umask 02H +Counts the number of multiply operations executed. This includes integer as +well as floating point multiply operations but excludes DPPS mul and MPSAD. +Count may be incorrect when SMT is on. +.It Li INST_QUEUE_WRITES +.Pq Event 17H , Umask 01H +Counts the number of instructions written into the instruction queue every +cycle. +.It Li INST_DECODED.DEC0 +.Pq Event 18H , Umask 01H +Counts number of instructions that require decoder 0 to be decoded. Usually, +this means that the instruction maps to more than 1 uop. +.It Li TWO_UOP_INSTS_DECODED +.Pq Event 19H , Umask 01H +An instruction that generates two uops was decoded. +.It Li INST_QUEUE_WRITE_CYCLES +.Pq Event 1EH , Umask 01H +This event counts the number of cycles during which instructions are written +to the instruction queue. Dividing this counter by the number of +instructions written to the instruction queue (INST_QUEUE_WRITES) yields the +average number of instructions decoded each cycle. If this number is less +than four and the pipe stalls, this indicates that the decoder is failing to +decode enough instructions per cycle to sustain the 4-wide pipeline. +If SSE* instructions that are 6 bytes or longer arrive one after another, +then front end throughput may limit execution speed. +.It Li LSD_OVERFLOW +.Pq Event 20H , Umask 01H +Counts number of loops that cannot stream from the instruction queue. +.It Li L2_RQSTS.LD_HIT +.Pq Event 24H , Umask 01H +Counts number of loads that hit the L2 cache. L2 loads include both L1D +demand misses as well as L1D prefetches. L2 loads can be rejected for +various reasons. Only non rejected loads are counted.
+.It Li L2_RQSTS.LD_MISS +.Pq Event 24H , Umask 02H +Counts the number of loads that miss the L2 cache. L2 loads include both L1D +demand misses as well as L1D prefetches. +.It Li L2_RQSTS.LOADS +.Pq Event 24H , Umask 03H +Counts all L2 load requests. L2 loads include both L1D demand misses as well +as L1D prefetches. +.It Li L2_RQSTS.RFO_HIT +.Pq Event 24H , Umask 04H +Counts the number of store RFO requests that hit the L2 cache. L2 RFO +requests include both L1D demand RFO misses as well as L1D RFO prefetches. +Count includes WC memory requests, where the data is not fetched but the +permission to write the line is required. +.It Li L2_RQSTS.RFO_MISS +.Pq Event 24H , Umask 08H +Counts the number of store RFO requests that miss the L2 cache. L2 RFO +requests include both L1D demand RFO misses as well as L1D RFO prefetches. +.It Li L2_RQSTS.RFOS +.Pq Event 24H , Umask 0CH +Counts all L2 store RFO requests. L2 RFO requests include both L1D demand +RFO misses as well as L1D RFO prefetches. +.It Li L2_RQSTS.IFETCH_HIT +.Pq Event 24H , Umask 10H +Counts number of instruction fetches that hit the L2 cache. L2 instruction +fetches include both L1I demand misses as well as L1I instruction +prefetches. +.It Li L2_RQSTS.IFETCH_MISS +.Pq Event 24H , Umask 20H +Counts number of instruction fetches that miss the L2 cache. L2 instruction +fetches include both L1I demand misses as well as L1I instruction +prefetches. +.It Li L2_RQSTS.IFETCHES +.Pq Event 24H , Umask 30H +Counts all instruction fetches. L2 instruction fetches include both L1I +demand misses as well as L1I instruction prefetches. +.It Li L2_RQSTS.PREFETCH_HIT +.Pq Event 24H , Umask 40H +Counts L2 prefetch hits for both code and data. +.It Li L2_RQSTS.PREFETCH_MISS +.Pq Event 24H , Umask 80H +Counts L2 prefetch misses for both code and data. +.It Li L2_RQSTS.PREFETCHES +.Pq Event 24H , Umask C0H +Counts all L2 prefetches for both code and data. 
+.It Li L2_RQSTS.MISS +.Pq Event 24H , Umask AAH +Counts all L2 misses for both code and data. +.It Li L2_RQSTS.REFERENCES +.Pq Event 24H , Umask FFH +Counts all L2 requests for both code and data. +.It Li L2_DATA_RQSTS.DEMAND.I_STATE +.Pq Event 26H , Umask 01H +Counts number of L2 data demand loads where the cache line to be loaded is +in the I (invalid) state, i.e. a cache miss. L2 demand loads are both L1D +demand misses and L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.S_STATE +.Pq Event 26H , Umask 02H +Counts number of L2 data demand loads where the cache line to be loaded is +in the S (shared) state. L2 demand loads are both L1D demand misses and L1D +prefetches. +.It Li L2_DATA_RQSTS.DEMAND.E_STATE +.Pq Event 26H , Umask 04H +Counts number of L2 data demand loads where the cache line to be loaded is +in the E (exclusive) state. L2 demand loads are both L1D demand misses and +L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.M_STATE +.Pq Event 26H , Umask 08H +Counts number of L2 data demand loads where the cache line to be loaded is +in the M (modified) state. L2 demand loads are both L1D demand misses and +L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.MESI +.Pq Event 26H , Umask 0FH +Counts all L2 data demand requests. L2 demand loads are both L1D demand +misses and L1D prefetches. +.It Li L2_DATA_RQSTS.PREFETCH.I_STATE +.Pq Event 26H , Umask 10H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the I (invalid) state, i.e. a cache miss. +.It Li L2_DATA_RQSTS.PREFETCH.S_STATE +.Pq Event 26H , Umask 20H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the S (shared) state. A prefetch RFO will miss on an S state line, while +a prefetch read will hit on an S state line. +.It Li L2_DATA_RQSTS.PREFETCH.E_STATE +.Pq Event 26H , Umask 40H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the E (exclusive) state. 
+.It Li L2_DATA_RQSTS.PREFETCH.M_STATE +.Pq Event 26H , Umask 80H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the M (modified) state. +.It Li L2_DATA_RQSTS.PREFETCH.MESI +.Pq Event 26H , Umask F0H +Counts all L2 prefetch requests. +.It Li L2_DATA_RQSTS.ANY +.Pq Event 26H , Umask FFH +Counts all L2 data requests. +.It Li L2_WRITE.RFO.I_STATE +.Pq Event 27H , Umask 01H +Counts number of L2 demand store RFO requests where the cache line to be +loaded is in the I (invalid) state, i.e. a cache miss. The L1D prefetcher +does not issue a RFO prefetch. +This is a demand RFO request. +.It Li L2_WRITE.RFO.S_STATE +.Pq Event 27H , Umask 02H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the S (shared) state. The L1D prefetcher does not issue a RFO prefetch. +This is a demand RFO request. +.It Li L2_WRITE.RFO.M_STATE +.Pq Event 27H , Umask 08H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the M (modified) state. The L1D prefetcher does not issue a RFO prefetch. +This is a demand RFO request. +.It Li L2_WRITE.RFO.HIT +.Pq Event 27H , Umask 0EH +Counts number of L2 store RFO requests where the cache line to be loaded is +in either the S, E or M states. The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request. +.It Li L2_WRITE.RFO.MESI +.Pq Event 27H , Umask 0FH +Counts all L2 store RFO requests. The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request. +.It Li L2_WRITE.LOCK.I_STATE +.Pq Event 27H , Umask 10H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the I (invalid) state, i.e. a cache miss. +.It Li L2_WRITE.LOCK.S_STATE +.Pq Event 27H , Umask 20H +Counts number of L2 lock RFO requests where the cache line to be loaded is +in the S (shared) state.
+.It Li L2_WRITE.LOCK.E_STATE +.Pq Event 27H , Umask 40H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the E (exclusive) state. +.It Li L2_WRITE.LOCK.M_STATE +.Pq Event 27H , Umask 80H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the M (modified) state. +.It Li L2_WRITE.LOCK.HIT +.Pq Event 27H , Umask E0H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in either the S, E, or M state. +.It Li L2_WRITE.LOCK.MESI +.Pq Event 27H , Umask F0H +Counts all L2 demand lock RFO requests. +.It Li L1D_WB_L2.I_STATE +.Pq Event 28H , Umask 01H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the I (invalid) state, i.e. a cache miss. +.It Li L1D_WB_L2.S_STATE +.Pq Event 28H , Umask 02H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the S state. +.It Li L1D_WB_L2.E_STATE +.Pq Event 28H , Umask 04H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the E (exclusive) state. +.It Li L1D_WB_L2.M_STATE +.Pq Event 28H , Umask 08H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the M (modified) state. +.It Li L1D_WB_L2.MESI +.Pq Event 28H , Umask 0FH +Counts all L1 writebacks to the L2. +.It Li L3_LAT_CACHE.REFERENCE +.Pq Event 2EH , Umask 4FH +This event counts requests originating from the core that reference a cache +line in the last level cache. The event count includes speculative traffic +but excludes cache line fills due to a L2 hardware-prefetch. Because cache +hierarchy, cache sizes and other implementation-specific characteristics; +value comparison to estimate performance differences is not recommended. +see Table A-1 +.It Li L3_LAT_CACHE.MISS +.Pq Event 2EH , Umask 41H +This event counts each cache miss condition for references to the last level +cache. 
The event count may include speculative traffic but excludes cache +line fills due to L2 hardware-prefetches. Because cache hierarchy, cache +sizes and other implementation-specific characteristics; value comparison to +estimate performance differences is not recommended. +see Table A-1 +.It Li CPU_CLK_UNHALTED.THREAD_P +.Pq Event 3CH , Umask 00H +Counts the number of thread cycles while the thread is not in a halt state. +The thread enters the halt state when it is running the HLT instruction. The +core frequency may change from time to time due to power or thermal +throttling. +see Table A-1 +.It Li CPU_CLK_UNHALTED.REF_P +.Pq Event 3CH , Umask 01H +Increments at the frequency of TSC when not halted. +see Table A-1 +.It Li L1D_CACHE_LD.I_STATE +.Pq Event 40H , Umask 01H +Counts L1 data cache read requests where the cache line to be loaded is in +the I (invalid) state, i.e. the read request missed the cache. +Counter 0, 1 only +.It Li L1D_CACHE_LD.S_STATE +.Pq Event 40H , Umask 02H +Counts L1 data cache read requests where the cache line to be loaded is in +the S (shared) state. +Counter 0, 1 only +.It Li L1D_CACHE_LD.E_STATE +.Pq Event 40H , Umask 04H +Counts L1 data cache read requests where the cache line to be loaded is in +the E (exclusive) state. +Counter 0, 1 only +.It Li L1D_CACHE_LD.M_STATE +.Pq Event 40H , Umask 08H +Counts L1 data cache read requests where the cache line to be loaded is in +the M (modified) state. +Counter 0, 1 only +.It Li L1D_CACHE_LD.MESI +.Pq Event 40H , Umask 0FH +Counts L1 data cache read requests. +Counter 0, 1 only +.It Li L1D_CACHE_ST.S_STATE +.Pq Event 41H , Umask 02H +Counts L1 data cache store RFO requests where the cache line to be loaded is +in the S (shared) state. +Counter 0, 1 only +.It Li L1D_CACHE_ST.E_STATE +.Pq Event 41H , Umask 04H +Counts L1 data cache store RFO requests where the cache line to be loaded is +in the E (exclusive) state. 
+Counter 0, 1 only +.It Li L1D_CACHE_ST.M_STATE +.Pq Event 41H , Umask 08H +Counts L1 data cache store RFO requests where cache line to be loaded is in +the M (modified) state. +Counter 0, 1 only +.It Li L1D_CACHE_LOCK.HIT +.Pq Event 42H , Umask 01H +Counts retired load locks that hit in the L1 data cache or hit in an already +allocated fill buffer. The lock portion of the load lock transaction must +hit in the L1D. +The initial load will pull the lock into the L1 data cache. Counter 0, 1 +only +.It Li L1D_CACHE_LOCK.S_STATE +.Pq Event 42H , Umask 02H +Counts L1 data cache retired load locks that hit the target cache line in +the shared state. +Counter 0, 1 only +.It Li L1D_CACHE_LOCK.E_STATE +.Pq Event 42H , Umask 04H +Counts L1 data cache retired load locks that hit the target cache line in +the exclusive state. +Counter 0, 1 only +.It Li L1D_CACHE_LOCK.M_STATE +.Pq Event 42H , Umask 08H +Counts L1 data cache retired load locks that hit the target cache line in +the modified state. +Counter 0, 1 only +.It Li L1D_ALL_REF.ANY +.Pq Event 43H , Umask 01H +Counts all references (uncached, speculated and retired) to the L1 data +cache, including all loads and stores with any memory types. The event +counts memory accesses only when they are actually performed. For example, a +load blocked by unknown store address and later performed is only counted +once. +The event does not include non-memory accesses, such as I/O accesses. +Counter 0, 1 only +.It Li L1D_ALL_REF.CACHEABLE +.Pq Event 43H , Umask 02H +Counts all data reads and writes (speculated and retired) from cacheable +memory, including locked operations. +Counter 0, 1 only +.It Li L1D_PEND_MISS.LOAD_BUFFERS_FULL +.Pq Event 48H , Umask 02H +Counts cycles of L1 data cache load fill buffers full. +Counter 0, 1 only +.It Li DTLB_MISSES.ANY +.Pq Event 49H , Umask 01H +Counts the number of misses in the STLB which causes a page walk.
+.It Li DTLB_MISSES.WALK_COMPLETED +.Pq Event 49H , Umask 02H +Counts number of misses in the STLB which resulted in a completed page walk. +.It Li DTLB_MISSES.STLB_HIT +.Pq Event 49H , Umask 10H +Counts the number of DTLB first level misses that hit in the second level +TLB. This event is only relevant if the core contains multiple DTLB levels. +.It Li LOAD_HIT_PRE +.Pq Event 4CH , Umask 01H +Counts load operations sent to the L1 data cache while a previous SSE +prefetch instruction to the same cache line has started prefetching but has +not yet finished. +.It Li L1D_PREFETCH.REQUESTS +.Pq Event 4EH , Umask 01H +Counts number of hardware prefetch requests dispatched out of the prefetch +FIFO. +.It Li L1D_PREFETCH.MISS +.Pq Event 4EH , Umask 02H +Counts number of hardware prefetch requests that miss the L1D. There are two +prefetchers in the L1D. A streamer, which predicts lines sequentially after +this one should be fetched, and the IP prefetcher that remembers access +patterns for the current instruction. The streamer prefetcher stops on an +L1D hit, while the IP prefetcher does not. +.It Li L1D_PREFETCH.TRIGGERS +.Pq Event 4EH , Umask 04H +Counts number of prefetch requests triggered by the Finite State Machine and +pushed into the prefetch FIFO. Some of the prefetch requests are dropped due +to overwrites or competition between the IP index prefetcher and streamer +prefetcher. The prefetch FIFO contains 4 entries. +.It Li L1D.REPL +.Pq Event 51H , Umask 01H +Counts the number of lines brought into the L1 data cache. +Counter 0, 1 only +.It Li L1D.M_REPL +.Pq Event 51H , Umask 02H +Counts the number of modified lines brought into the L1 data cache. +Counter 0, 1 only +.It Li L1D.M_EVICT +.Pq Event 51H , Umask 04H +Counts the number of modified lines evicted from the L1 data cache due to +replacement. 
+Counter 0, 1 only +.It Li L1D.M_SNOOP_EVICT +.Pq Event 51H , Umask 08H +Counts the number of modified lines evicted from the L1 data cache due to +snoop HITM intervention. +Counter 0, 1 only +.It Li L1D_CACHE_PREFETCH_LOCK_FB_HIT +.Pq Event 52H , Umask 01H +Counts the number of cacheable load lock speculated instructions accepted +into the fill buffer. +.It Li L1D_CACHE_LOCK_FB_HIT +.Pq Event 53H , Umask 01H +Counts the number of cacheable load lock speculated or retired instructions +accepted into the fill buffer. +.It Li CACHE_LOCK_CYCLES.L1D_L2 +.Pq Event 63H , Umask 01H +Cycle count during which the L1D and L2 are locked. A lock is asserted when +there is a locked memory access, due to uncacheable memory, a locked +operation that spans two cache lines, or a page walk from an uncacheable +page table. +Counter 0, 1 only. L1D and L2 locks have a very high performance penalty and +it is highly recommended to avoid such accesses. +.It Li CACHE_LOCK_CYCLES.L1D +.Pq Event 63H , Umask 02H +Counts the number of cycles that cacheline in the L1 data cache unit is +locked. +Counter 0, 1 only. +.It Li IO_TRANSACTIONS +.Pq Event 6CH , Umask 01H +Counts the number of completed I/O transactions. +.It Li L1I.HITS +.Pq Event 80H , Umask 01H +Counts all instruction fetches that hit the L1 instruction cache. +.It Li L1I.MISSES +.Pq Event 80H , Umask 02H +Counts all instruction fetches that miss the L1I cache. This includes +instruction cache misses, streaming buffer misses, victim cache misses and +uncacheable fetches. An instruction fetch miss is counted only once and not +once for every cycle it is outstanding. +.It Li L1I.READS +.Pq Event 80H , Umask 03H +Counts all instruction fetches, including uncacheable fetches that bypass +the L1I. +.It Li L1I.CYCLES_STALLED +.Pq Event 80H , Umask 04H +Cycle counts for which an instruction fetch stalls due to a L1I cache miss, +ITLB miss or ITLB fault. +.It Li LARGE_ITLB.HIT +.Pq Event 82H , Umask 01H +Counts number of large ITLB hits. 
+.It Li ITLB_MISSES.ANY +.Pq Event 85H , Umask 01H +Counts the number of misses in all levels of the ITLB which causes a page +walk. +.It Li ITLB_MISSES.WALK_COMPLETED +.Pq Event 85H , Umask 02H +Counts number of misses in all levels of the ITLB which resulted in a +completed page walk. +.It Li ILD_STALL.LCP +.Pq Event 87H , Umask 01H +Cycles Instruction Length Decoder stalls due to length changing prefixes: +66, 67 or REX.W (for EM64T) instructions which change the length of the +decoded instruction. +.It Li ILD_STALL.MRU +.Pq Event 87H , Umask 02H +Instruction Length Decoder stall cycles due to Branch Prediction Unit (BPU) +Most Recently Used (MRU) bypass. +.It Li ILD_STALL.IQ_FULL +.Pq Event 87H , Umask 04H +Stall cycles due to a full instruction queue. +.It Li ILD_STALL.REGEN +.Pq Event 87H , Umask 08H +Counts the number of regen stalls. +.It Li ILD_STALL.ANY +.Pq Event 87H , Umask 0FH +Counts any cycles the Instruction Length Decoder is stalled. +.It Li BR_INST_EXEC.COND +.Pq Event 88H , Umask 01H +Counts the number of conditional near branch instructions executed, but not +necessarily retired. +.It Li BR_INST_EXEC.DIRECT +.Pq Event 88H , Umask 02H +Counts all unconditional near branch instructions excluding calls and +indirect branches. +.It Li BR_INST_EXEC.INDIRECT_NON_CALL +.Pq Event 88H , Umask 04H +Counts the number of executed indirect near branch instructions that are not +calls. +.It Li BR_INST_EXEC.NON_CALLS +.Pq Event 88H , Umask 07H +Counts all non call near branch instructions executed, but not necessarily +retired. +.It Li BR_INST_EXEC.RETURN_NEAR +.Pq Event 88H , Umask 08H +Counts indirect near branches that have a return mnemonic. +.It Li BR_INST_EXEC.DIRECT_NEAR_CALL +.Pq Event 88H , Umask 10H +Counts unconditional near call branch instructions, excluding non call +branch, executed. +.It Li BR_INST_EXEC.INDIRECT_NEAR_CALL +.Pq Event 88H , Umask 20H +Counts indirect near calls, including both register and memory indirect, +executed.
+.It Li BR_INST_EXEC.NEAR_CALLS +.Pq Event 88H , Umask 30H +Counts all near call branches executed, but not necessarily retired. +.It Li BR_INST_EXEC.TAKEN +.Pq Event 88H , Umask 40H +Counts taken near branches executed, but not necessarily retired. +.It Li BR_INST_EXEC.ANY +.Pq Event 88H , Umask 7FH +Counts all near executed branches (not necessarily retired). This includes +only instructions and not micro-op branches. Frequent branching is not +necessarily a major performance issue. However frequent branch +mispredictions may be a problem. +.It Li BR_MISP_EXEC.COND +.Pq Event 89H , Umask 01H +Counts the number of mispredicted conditional near branch instructions +executed, but not necessarily retired. +.It Li BR_MISP_EXEC.DIRECT +.Pq Event 89H , Umask 02H +Counts mispredicted macro unconditional near branch instructions, excluding +calls and indirect branches (should always be 0). +.It Li BR_MISP_EXEC.INDIRECT_NON_CALL +.Pq Event 89H , Umask 04H +Counts the number of executed mispredicted indirect near branch instructions +that are not calls. +.It Li BR_MISP_EXEC.NON_CALLS +.Pq Event 89H , Umask 07H +Counts mispredicted non call near branches executed, but not necessarily +retired. +.It Li BR_MISP_EXEC.RETURN_NEAR +.Pq Event 89H , Umask 08H +Counts mispredicted indirect branches that have a near return mnemonic. +.It Li BR_MISP_EXEC.DIRECT_NEAR_CALL +.Pq Event 89H , Umask 10H +Counts mispredicted non-indirect near calls executed, (should always be 0). +.It Li BR_MISP_EXEC.INDIRECT_NEAR_CALL +.Pq Event 89H , Umask 20H +Counts mispredicted indirect near calls executed, including both register +and memory indirect. +.It Li BR_MISP_EXEC.NEAR_CALLS +.Pq Event 89H , Umask 30H +Counts all mispredicted near call branches executed, but not necessarily +retired. +.It Li BR_MISP_EXEC.TAKEN +.Pq Event 89H , Umask 40H +Counts executed mispredicted near branches that are taken, but not +necessarily retired.
+.It Li BR_MISP_EXEC.ANY +.Pq Event 89H , Umask 7FH +Counts the number of mispredicted near branch instructions that were +executed, but not necessarily retired. +.It Li RESOURCE_STALLS.ANY +.Pq Event A2H , Umask 01H +Counts the number of Allocator resource related stalls. Includes register +renaming buffer entries, memory buffer entries. In addition to resource +related stalls, this event counts some other events. Includes stalls arising +during branch misprediction recovery, such as if retirement of the +mispredicted branch is delayed and stalls arising while store buffer is +draining from synchronizing operations. +Does not include stalls due to SuperQ (off core) queue full, too many cache +misses, etc. +.It Li RESOURCE_STALLS.LOAD +.Pq Event A2H , Umask 02H +Counts the cycles of stall due to lack of load buffer for load operation. +.It Li RESOURCE_STALLS.RS_FULL +.Pq Event A2H , Umask 04H +This event counts the number of cycles when the number of instructions in +the pipeline waiting for execution reaches the limit the processor can +handle. A high count of this event indicates that there are long latency +operations in the pipe (possibly load and store operations that miss the L2 +cache, or instructions dependent upon instructions further down the pipeline +that have yet to retire). +When RS is full, new instructions can not enter the reservation station and +start execution. +.It Li RESOURCE_STALLS.STORE +.Pq Event A2H , Umask 08H +This event counts the number of cycles that a resource related stall will +occur due to the number of store instructions reaching the limit of the +pipeline, (i.e. all store buffers are used). The stall ends when a store +instruction commits its data to the cache or memory. +.It Li RESOURCE_STALLS.ROB_FULL +.Pq Event A2H , Umask 10H +Counts the cycles of stall due to re-order buffer full.
+.It Li RESOURCE_STALLS.FPCW +.Pq Event A2H , Umask 20H +Counts the number of cycles while execution was stalled due to writing the +floating-point unit (FPU) control word. +.It Li RESOURCE_STALLS.MXCSR +.Pq Event A2H , Umask 40H +Stalls due to the MXCSR register rename occurring too close to a previous +MXCSR rename. The MXCSR provides control and status for the MMX registers. +.It Li RESOURCE_STALLS.OTHER +.Pq Event A2H , Umask 80H +Counts the number of cycles while execution was stalled due to other +resource issues. +.It Li MACRO_INSTS.FUSIONS_DECODED +.Pq Event A6H , Umask 01H +Counts the number of instructions decoded that are macro-fused but not +necessarily executed or retired. +.It Li BACLEAR_FORCE_IQ +.Pq Event A7H , Umask 01H +Counts number of times a BACLEAR was forced by the Instruction Queue. The IQ +is also responsible for providing conditional branch prediction direction +based on a static scheme and dynamic data provided by the L2 Branch +Prediction Unit. If the conditional branch target is not found in the Target +Array and the IQ predicts that the branch is taken, then the IQ will force +the Branch Address Calculator to issue a BACLEAR. Each BACLEAR asserted by +the BAC generates approximately an 8 cycle bubble in the instruction fetch +pipeline. +.It Li LSD.UOPS +.Pq Event A8H , Umask 01H +Counts the number of micro-ops delivered by loop stream detector +Use cmask=1 and invert to count cycles +.It Li ITLB_FLUSH +.Pq Event AEH , Umask 01H +Counts the number of ITLB flushes +.It Li OFFCORE_REQUESTS.L1D_WRITEBACK +.Pq Event B0H , Umask 40H +Counts number of L1D writebacks to the uncore. +.It Li UOPS_EXECUTED.PORT0 +.Pq Event B1H , Umask 01H +Counts number of Uops executed that were issued on port 0. Port 0 handles +integer arithmetic, SIMD and FP add Uops. +.It Li UOPS_EXECUTED.PORT1 +.Pq Event B1H , Umask 02H +Counts number of Uops executed that were issued on port 1.
Port 1 handles +integer arithmetic, SIMD, integer shift, FP multiply and FP divide Uops. +.It Li UOPS_EXECUTED.PORT2_CORE +.Pq Event B1H , Umask 04H +Counts number of Uops executed that were issued on port 2. Port 2 handles +the load Uops. This is a core count only and can not be collected per +thread. +.It Li UOPS_EXECUTED.PORT3_CORE +.Pq Event B1H , Umask 08H +Counts number of Uops executed that were issued on port 3. Port 3 handles +store Uops. This is a core count only and can not be collected per thread. +.It Li UOPS_EXECUTED.PORT4_CORE +.Pq Event B1H , Umask 10H +Counts number of Uops executed that were issued on port 4. Port 4 handles +the value to be stored for the store Uops issued on port 3. This is a core +count only and can not be collected per thread. +.It Li UOPS_EXECUTED.CORE_ACTIVE_CYCLES_NO_PORT5 +.Pq Event B1H , Umask 1FH +Counts cycles when the Uops executed were issued from any ports except port +5. Use Cmask=1 for active cycles; Cmask=0 for weighted cycles; Use CMask=1, +Invert=1 to count P0-4 stalled cycles Use Cmask=1, Edge=1, Invert=1 to count +P0-4 stalls. +.It Li UOPS_EXECUTED.PORT5 +.Pq Event B1H , Umask 20H +Counts number of Uops executed that were issued on port 5. +.It Li UOPS_EXECUTED.CORE_ACTIVE_CYCLES +.Pq Event B1H , Umask 3FH +Counts cycles when the Uops are executing. Use Cmask=1 for active cycles; +Cmask=0 for weighted cycles; Use CMask=1, Invert=1 to count P0-4 stalled +cycles Use Cmask=1, Edge=1, Invert=1 to count P0-4 stalls. +.It Li UOPS_EXECUTED.PORT015 +.Pq Event B1H , Umask 40H +Counts number of Uops executed that were issued on port 0, 1, or 5. +use cmask=1, invert=1 to count stall cycles +.It Li UOPS_EXECUTED.PORT234 +.Pq Event B1H , Umask 80H +Counts number of Uops executed that were issued on port 2, 3, or 4. +.It Li OFFCORE_REQUESTS_SQ_FULL +.Pq Event B2H , Umask 01H +Counts number of cycles the SQ is full to handle off-core requests.
+.It Li OFF_CORE_RESPONSE_0 +.Pq Event B7H , Umask 01H +see Section 30.6.1.3, Off-core Response Performance Monitoring in the +Processor Core +Requires programming MSR 01A6H +.It Li SNOOP_RESPONSE.HIT +.Pq Event B8H , Umask 01H +Counts HIT snoop response sent by this thread in response to a snoop +request. +.It Li SNOOP_RESPONSE.HITE +.Pq Event B8H , Umask 02H +Counts HIT E snoop response sent by this thread in response to a snoop +request. +.It Li SNOOP_RESPONSE.HITM +.Pq Event B8H , Umask 04H +Counts HIT M snoop response sent by this thread in response to a snoop +request. +.It Li OFF_CORE_RESPONSE_1 +.Pq Event BBH , Umask 01H +see Section 30.6.1.3, Off-core Response Performance Monitoring in the +Processor Core +Requires programming MSR 01A7H +.It Li INST_RETIRED.ANY_P +.Pq Event C0H , Umask 01H +See Table A-1 +Notes: INST_RETIRED.ANY is counted by a designated fixed counter. +INST_RETIRED.ANY_P is counted by a programmable counter and is an +architectural performance event. Event is supported if CPUID.A.EBX[1] = 0. +Counting: Faulting executions of GETSEC/VM entry/VM Exit/MWait will not +count as retired instructions. +.It Li INST_RETIRED.X87 +.Pq Event C0H , Umask 02H +Counts the number of floating point computational operations retired: +floating point computational operations executed by the assist handler and +sub-operations of complex floating point instructions like transcendental +instructions. +.It Li INST_RETIRED.MMX +.Pq Event C0H , Umask 04H +Counts the number of MMX instructions retired. +.It Li UOPS_RETIRED.ANY +.Pq Event C2H , Umask 01H +Counts the number of micro-ops retired, (macro-fused=1, micro-fused=2, +others=1; maximum count of 8 per cycle). Most instructions are composed of +one or two micro-ops. Some instructions are decoded into longer sequences +such as repeat instructions, floating point transcendental instructions, and +assists.
+Use cmask=1 and invert to count active cycles or stalled cycles +.It Li UOPS_RETIRED.RETIRE_SLOTS +.Pq Event C2H , Umask 02H +Counts the number of retirement slots used each cycle +.It Li UOPS_RETIRED.MACRO_FUSED +.Pq Event C2H , Umask 04H +Counts number of macro-fused uops retired. +.It Li MACHINE_CLEARS.CYCLES +.Pq Event C3H , Umask 01H +Counts the cycles machine clear is asserted. +.It Li MACHINE_CLEARS.MEM_ORDER +.Pq Event C3H , Umask 02H +Counts the number of machine clears due to memory order conflicts. +.It Li MACHINE_CLEARS.SMC +.Pq Event C3H , Umask 04H +Counts the number of times that a program writes to a code section. +Self-modifying code causes a severe penalty in all Intel 64 and IA-32 +processors. The modified cache line is written back to the L2 and L3 caches. +.It Li BR_INST_RETIRED.ALL_BRANCHES +.Pq Event C4H , Umask 00H +See Table A-1 +.It Li BR_INST_RETIRED.CONDITIONAL +.Pq Event C4H , Umask 01H +Counts the number of conditional branch instructions retired. +.It Li BR_INST_RETIRED.NEAR_CALL +.Pq Event C4H , Umask 02H +Counts the number of direct & indirect near unconditional calls retired +.It Li BR_INST_RETIRED.ALL_BRANCHES +.Pq Event C4H , Umask 04H +Counts the number of branch instructions retired +.It Li BR_MISP_RETIRED.ALL_BRANCHES +.Pq Event C5H , Umask 00H +See Table A-1 +.It Li BR_MISP_RETIRED.NEAR_CALL +.Pq Event C5H , Umask 02H +Counts mispredicted direct & indirect near unconditional retired calls. +.It Li SSEX_UOPS_RETIRED.PACKED_SINGLE +.Pq Event C7H , Umask 01H +Counts SIMD packed single-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.SCALAR_SINGLE +.Pq Event C7H , Umask 02H +Counts SIMD scalar single-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.PACKED_DOUBLE +.Pq Event C7H , Umask 04H +Counts SIMD packed double-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.SCALAR_DOUBLE +.Pq Event C7H , Umask 08H +Counts SIMD scalar double-precision floating point Uops retired.
+.It Li SSEX_UOPS_RETIRED.VECTOR_INTEGER +.Pq Event C7H , Umask 10H +Counts 128-bit SIMD vector integer Uops retired. +.It Li ITLB_MISS_RETIRED +.Pq Event C8H , Umask 20H +Counts the number of retired instructions that missed the ITLB when the +instruction was fetched. +.It Li MEM_LOAD_RETIRED.L1D_HIT +.Pq Event CBH , Umask 01H +Counts number of retired loads that hit the L1 data cache. +.It Li MEM_LOAD_RETIRED.L2_HIT +.Pq Event CBH , Umask 02H +Counts number of retired loads that hit the L2 data cache. +.It Li MEM_LOAD_RETIRED.L3_UNSHARED_HIT +.Pq Event CBH , Umask 04H +Counts number of retired loads that hit their own, unshared lines in the L3 +cache. +.It Li MEM_LOAD_RETIRED.OTHER_CORE_L2_HIT_HITM +.Pq Event CBH , Umask 08H +Counts number of retired loads that hit in a sibling core's L2 (on die +core). Since the L3 is inclusive of all cores on the package, this is an L3 +hit. This counts both clean or modified hits. +.It Li MEM_LOAD_RETIRED.L3_MISS +.Pq Event CBH , Umask 10H +Counts number of retired loads that miss the L3 cache. The load was +satisfied by a remote socket, local memory or an IOH. +.It Li MEM_LOAD_RETIRED.HIT_LFB +.Pq Event CBH , Umask 40H +Counts number of retired loads that miss the L1D and the address is located +in an allocated line fill buffer and will soon be committed to cache. This +is counting secondary L1D misses. +.It Li MEM_LOAD_RETIRED.DTLB_MISS +.Pq Event CBH , Umask 80H +Counts the number of retired loads that missed the DTLB. The DTLB miss is +not counted if the load operation causes a fault. This event counts loads +from cacheable memory only. The event does not count loads by software +prefetches. Counts both primary and secondary misses to the TLB. +.It Li FP_MMX_TRANS.TO_FP +.Pq Event CCH , Umask 01H +Counts the first floating-point instruction following any MMX instruction. +You can use this event to estimate the penalties for the transitions between +floating-point and MMX technology states. 
+.It Li FP_MMX_TRANS.TO_MMX +.Pq Event CCH , Umask 02H +Counts the first MMX instruction following a floating-point instruction. You +can use this event to estimate the penalties for the transitions between +floating-point and MMX technology states. +.It Li FP_MMX_TRANS.ANY +.Pq Event CCH , Umask 03H +Counts all transitions from floating point to MMX instructions and from MMX +instructions to floating point instructions. You can use this event to +estimate the penalties for the transitions between floating-point and MMX +technology states. +.It Li MACRO_INSTS.DECODED +.Pq Event D0H , Umask 01H +Counts the number of instructions decoded, (but not necessarily executed or +retired). +.It Li UOPS_DECODED.MS +.Pq Event D1H , Umask 02H +Counts the number of Uops decoded by the Microcode Sequencer, MS. The MS +delivers uops when the instruction is more than 4 uops long or a microcode +assist is occurring. +.It Li UOPS_DECODED.ESP_FOLDING +.Pq Event D1H , Umask 04H +Counts number of stack pointer (ESP) instructions decoded: push , pop , call +, ret, etc. ESP instructions do not generate a Uop to increment or decrement +ESP. Instead, they update an ESP_Offset register that keeps track of the +delta to the current value of the ESP register. +.It Li UOPS_DECODED.ESP_SYNC +.Pq Event D1H , Umask 08H +Counts number of stack pointer (ESP) sync operations where an ESP +instruction is corrected by adding the ESP offset register to the current +value of the ESP register. +.It Li RAT_STALLS.FLAGS +.Pq Event D2H , Umask 01H +Counts the number of cycles during which execution stalled due to several +reasons, one of which is a partial flag register stall. A partial register +stall may occur when two conditions are met: 1) an instruction modifies +some, but not all, of the flags in the flag register and 2) the next +instruction, which depends on flags, depends on flags that were not modified +by this instruction. 
+.It Li RAT_STALLS.REGISTERS +.Pq Event D2H , Umask 02H +This event counts the number of cycles instruction execution latency became +longer than the defined latency because the instruction used a register that +was partially written by previous instruction. +.It Li RAT_STALLS.ROB_READ_PORT +.Pq Event D2H , Umask 04H +Counts the number of cycles when ROB read port stalls occurred, which did +not allow new micro-ops to enter the out-of-order pipeline. Note that, at +this stage in the pipeline, additional stalls may occur at the same cycle +and prevent the stalled micro-ops from entering the pipe. In such a case, +micro-ops retry entering the execution pipe in the next cycle and the +ROB-read port stall is counted again. +.It Li RAT_STALLS.SCOREBOARD +.Pq Event D2H , Umask 08H +Counts the cycles where we stall due to microarchitecturally required +serialization. Microcode scoreboarding stalls. +.It Li RAT_STALLS.ANY +.Pq Event D2H , Umask 0FH +Counts all Register Allocation Table stall cycles due to: Cycles when ROB +read port stalls occurred, which did not allow new micro-ops to enter the +execution pipe. Cycles when partial register stalls occurred Cycles when +flag stalls occurred Cycles floating-point unit (FPU) status word stalls +occurred. To count each of these conditions separately use the events: +RAT_STALLS.ROB_READ_PORT, RAT_STALLS.PARTIAL, RAT_STALLS.FLAGS, and +RAT_STALLS.FPSW. +.It Li SEG_RENAME_STALLS +.Pq Event D4H , Umask 01H +Counts the number of stall cycles due to the lack of renaming resources for +the ES, DS, FS, and GS segment registers. If a segment is renamed but not +retired and a second update to the same segment occurs, a stall occurs in +the front-end of the pipeline until the renamed segment retires. +.It Li ES_REG_RENAMES +.Pq Event D5H , Umask 01H +Counts the number of times the ES segment register is renamed. +.It Li UOP_UNFUSION +.Pq Event DBH , Umask 01H +Counts unfusion events due to floating point exception to a fused uop. 
+.It Li BR_INST_DECODED +.Pq Event E0H , Umask 01H +Counts the number of branch instructions decoded. +.It Li BPU_MISSED_CALL_RET +.Pq Event E5H , Umask 01H +Counts number of times the Branch Prediction Unit missed predicting a call +or return branch. +.It Li BACLEAR.CLEAR +.Pq Event E6H , Umask 01H +Counts the number of times the front end is resteered, mainly when the +Branch Prediction Unit cannot provide a correct prediction and this is +corrected by the Branch Address Calculator at the front end. This can occur +if the code has many branches such that they cannot be consumed by the BPU. +Each BACLEAR asserted by the BAC generates approximately an 8 cycle bubble +in the instruction fetch pipeline. The effect on total execution time +depends on the surrounding code. +.It Li BACLEAR.BAD_TARGET +.Pq Event E6H , Umask 02H +Counts number of Branch Address Calculator clears (BACLEAR) asserted due to +conditional branch instructions in which there was a target hit but the +direction was wrong. Each BACLEAR asserted by the BAC generates +approximately an 8 cycle bubble in the instruction fetch pipeline. +.It Li BPU_CLEARS.EARLY +.Pq Event E8H , Umask 01H +Counts early (normal) Branch Prediction Unit clears: BPU predicted a taken +branch after incorrectly assuming that it was not taken. +The BPU clear leads to a 2 cycle bubble in the Front End. +.It Li BPU_CLEARS.LATE +.Pq Event E8H , Umask 02H +Counts late Branch Prediction Unit clears due to Most Recently Used +conflicts. The BPU clear leads to a 3 cycle bubble in the Front End. +.It Li BPU_CLEARS.ANY +.Pq Event E8H , Umask 03H +Counts all BPU clears. +.It Li L2_TRANSACTIONS.LOAD +.Pq Event F0H , Umask 01H +Counts L2 load operations due to HW prefetch or demand loads. +.It Li L2_TRANSACTIONS.RFO +.Pq Event F0H , Umask 02H +Counts L2 RFO operations due to HW prefetch or demand RFOs. +.It Li L2_TRANSACTIONS.IFETCH +.Pq Event F0H , Umask 04H +Counts L2 instruction fetch operations due to HW prefetch or demand ifetch.
+.It Li L2_TRANSACTIONS.PREFETCH +.Pq Event F0H , Umask 08H +Counts L2 prefetch operations. +.It Li L2_TRANSACTIONS.L1D_WB +.Pq Event F0H , Umask 10H +Counts L1D writeback operations to the L2. +.It Li L2_TRANSACTIONS.FILL +.Pq Event F0H , Umask 20H +Counts L2 cache line fill operations due to load, RFO, L1D writeback or +prefetch. +.It Li L2_TRANSACTIONS.WB +.Pq Event F0H , Umask 40H +Counts L2 writeback operations to the L3. +.It Li L2_TRANSACTIONS.ANY +.Pq Event F0H , Umask 80H +Counts all L2 cache operations. +.It Li L2_LINES_IN.S_STATE +.Pq Event F1H , Umask 02H +Counts the number of cache lines allocated in the L2 cache in the S (shared) +state. +.It Li L2_LINES_IN.E_STATE +.Pq Event F1H , Umask 04H +Counts the number of cache lines allocated in the L2 cache in the E +(exclusive) state. +.It Li L2_LINES_IN.ANY +.Pq Event F1H , Umask 07H +Counts the number of cache lines allocated in the L2 cache. +.It Li L2_LINES_OUT.DEMAND_CLEAN +.Pq Event F2H , Umask 01H +Counts L2 clean cache lines evicted by a demand request. +.It Li L2_LINES_OUT.DEMAND_DIRTY +.Pq Event F2H , Umask 02H +Counts L2 dirty (modified) cache lines evicted by a demand request. +.It Li L2_LINES_OUT.PREFETCH_CLEAN +.Pq Event F2H , Umask 04H +Counts L2 clean cache line evicted by a prefetch request. +.It Li L2_LINES_OUT.PREFETCH_DIRTY +.Pq Event F2H , Umask 08H +Counts L2 modified cache line evicted by a prefetch request. +.It Li L2_LINES_OUT.ANY +.Pq Event F2H , Umask 0FH +Counts all L2 cache lines evicted for any reason. +.It Li SQ_MISC.SPLIT_LOCK +.Pq Event F4H , Umask 10H +Counts the number of SQ lock splits across a cache line. +.It Li SQ_FULL_STALL_CYCLES +.Pq Event F6H , Umask 01H +Counts cycles the Super Queue is full. Neither of the threads on this core +will be able to access the uncore. +.It Li FP_ASSIST.ALL +.Pq Event F7H , Umask 01H +Counts the number of floating point operations executed that required +micro-code assist intervention. 
Assists are required in the following cases: +SSE instructions, (Denormal input when the DAZ flag is off or Underflow +result when the FTZ flag is off): x87 instructions, (NaN or denormal are +loaded to a register or used as input from memory, Division by 0 or +Underflow output). +.It Li FP_ASSIST.OUTPUT +.Pq Event F7H , Umask 02H +Counts number of floating point micro-code assist when the output value +(destination register) is invalid. +.It Li FP_ASSIST.INPUT +.Pq Event F7H , Umask 04H +Counts number of floating point micro-code assist when the input value (one +of the source operands to an FP instruction) is invalid. +.It Li SIMD_INT_64.PACKED_MPY +.Pq Event FDH , Umask 01H +Counts number of SIMD integer 64 bit packed multiply operations. +.It Li SIMD_INT_64.PACKED_SHIFT +.Pq Event FDH , Umask 02H +Counts number of SIMD integer 64 bit packed shift operations. +.It Li SIMD_INT_64.PACK +.Pq Event FDH , Umask 04H +Counts number of SIMD integer 64 bit pack operations. +.It Li SIMD_INT_64.UNPACK +.Pq Event FDH , Umask 08H +Counts number of SIMD integer 64 bit unpack operations. +.It Li SIMD_INT_64.PACKED_LOGICAL +.Pq Event FDH , Umask 10H +Counts number of SIMD integer 64 bit logical operations. +.It Li SIMD_INT_64.PACKED_ARITH +.Pq Event FDH , Umask 20H +Counts number of SIMD integer 64 bit arithmetic operations. +.It Li SIMD_INT_64.SHUFFLE_MOVE +.Pq Event FDH , Umask 40H +Counts number of SIMD integer 64 bit shift or move operations. +.El +.Ss Event Specifiers (Programmable PMCs) +Core i7 and Xeon 5500 programmable PMCs support the following events as +of the June 2009 document (removed in December 2009): +.Bl -tag -width indent +.It Li SB_FORWARD.ANY +.Pq Event 02H , Umask 01H +Counts the number of store forwards. +.It Li LOAD_BLOCK.STD +.Pq Event 03H , Umask 01H +Counts the number of loads blocked by a preceding store with unknown data. +.It Li LOAD_BLOCK.ADDRESS_OFFSET +.Pq Event 03H , Umask 04H +Counts the number of loads blocked by a preceding store address.
+.It Li LOAD_BLOCK.ADDRESS_OFFSET +.Pq Event 01H , Umask 04H +Counts the cycles of store buffer drains. +.It Li MISALIGN_MEM_REF.LOAD +.Pq Event 05H , Umask 01H +Counts the number of misaligned load references +.It Li MISALIGN_MEM_REF.STORE +.Pq Event 05H , Umask 02H +Counts the number of misaligned store references +.It Li MISALIGN_MEM_REF.ANY +.Pq Event 05H , Umask 03H +Counts the number of misaligned memory references +.It Li STORE_BLOCKS.NOT_STA +.Pq Event 06H , Umask 01H +This event counts the number of load operations delayed caused by preceding +stores whose addresses are known but whose data is unknown, and preceding +stores that conflict with the load but which incompletely overlap the load. +.It Li STORE_BLOCKS.STA +.Pq Event 06H , Umask 02H +This event counts load operations delayed caused by preceding stores whose +addresses are unknown (STA block). +.It Li STORE_BLOCKS.ANY +.Pq Event 06H , Umask 0FH +All loads delayed due to store blocks +.It Li MEMORY_DISAMBIGURATION.RESET +.Pq Event 09H , Umask 01H +Counts memory disambiguration reset cycles +.It Li MEMORY_DISAMBIGURATION.SUCCESS +.Pq Event 09H , Umask 02H +Counts the number of loads that memory disambiguration succeeded +.It Li MEMORY_DISAMBIGURATION.WATCHDOG +.Pq Event 09H , Umask 04H +Counts the number of times the memory disambiguration watchdog kicked in. +.It Li MEMORY_DISAMBIGURATION.WATCH_CYCLES +.Pq Event 09H , Umask 08H +Counts the cycles that the memory disambiguration watchdog is active. +set invert=1, cmask = 1 +.It Li HW_INT.RCV +.Pq Event 1DH , Umask 01H +Number of interrupt received +.It Li HW_INT.CYCLES_MASKED +.Pq Event 1DH , Umask 02H +Number of cycles interrupt are masked +.It Li HW_INT.CYCLES_PENDING_AND_MASKED +.Pq Event 1DH , Umask 04H +Number of cycles interrupts are pending and masked +.It Li HW_INT.CYCLES_PENDING_AND_MASKED +.Pq Event 04H , Umask 04H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the E (exclusive) state. 
The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request +.It Li HW_INT.CYCLES_PENDING_AND_MASKED +.Pq Event 27H , Umask 04H +LONGEST_LAT_CACH E.MISS +.It Li UOPS_DECODED.DEC0 +.Pq Event 3DH , Umask 01H +Counts micro-ops decoded by decoder 0. +.It Li UOPS_DECODED.DEC0 +.Pq Event 01H , Umask 01H +Counts L1 data cache store RFO requests where the cache line to be loaded is +in the I state. +Counter 0, 1 only +.It Li 0FH +.Pq Event 41H , Umask 41H +L1D_CACHE_ST.MESI +Counts L1 data cache store RFO requests. +Counter 0, 1 only +.It Li DTLB_MISSES.PDE_MISS +.Pq Event 49H , Umask 20H +Number of DTLB cache misses where the low part of the linear to physical +address translation was missed. +.It Li DTLB_MISSES.PDP_MISS +.Pq Event 49H , Umask 40H +Number of DTLB misses where the high part of the linear to physical address +translation was missed. +.It Li DTLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 49H , Umask 80H +Counts number of completed large page walks due to misses in the STLB. +.It Li SSE_MEM_EXEC.NTA +.Pq Event 4BH , Umask 01H +Counts number of SSE NTA prefetch/weakly-ordered instructions which missed +the L1 data cache. +.It Li SSE_MEM_EXEC.STREAMING_STORES +.Pq Event 4BH , Umask 08H +Counts number of SSE non temporal stores +.It Li SFENCE_CYCLES +.Pq Event 4DH , Umask 01H +Counts store fence cycles +.It Li EPT.EPDE_MISS +.Pq Event 4FH , Umask 02H +Counts Extended Page Directory Entry misses. The Extended Page Directory +cache is used by Virtual Machine operating systems while the guest operating +systems use the standard TLB caches. +.It Li EPT.EPDPE_HIT +.Pq Event 4FH , Umask 04H +Counts Extended Page Directory Pointer Entry hits. +.It Li EPT.EPDPE_MISS +.Pq Event 4FH , Umask 08H +Counts Extended Page Directory Pointer Entry misses. T +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_DATA +.Pq Event 60H , Umask 01H +Counts weighted cycles of offcore demand data read requests. Does not +include L2 prefetch requests. 
+counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_CODE +.Pq Event 60H , Umask 02H +Counts weighted cycles of offcore demand code read requests. Does not +include L2 prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.RFO +.Pq Event 60H , Umask 04H +Counts weighted cycles of offcore demand RFO requests. Does not include L2 +prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.ANY.READ +.Pq Event 60H , Umask 08H +Counts weighted cycles of offcore read requests of any kind. Include L2 +prefetch requests. +counter 0 +.It Li IFU_IVC.FULL +.Pq Event 81H , Umask 01H +Instruction Fetch unit victim cache full. +.It Li IFU_IVC.L1I_EVICTION +.Pq Event 81H , Umask 02H +L1 Instruction cache evictions. +.It Li L1I_OPPORTUNISTIC_HITS +.Pq Event 83H , Umask 01H +Opportunistic hits in streaming. +.It Li ITLB_MISSES.WALK_CYCLES +.Pq Event 85H , Umask 04H +Counts ITLB miss page walk cycles. +.It Li ITLB_MISSES.PMH_BUSY_CYCLES +.Pq Event 85H , Umask 04H +Counts PMH busy cycles. +.It Li ITLB_MISSES.STLB_HIT +.Pq Event 85H , Umask 10H +Counts the number of ITLB misses that hit in the second level TLB. +.It Li ITLB_MISSES.PDE_MISS +.Pq Event 85H , Umask 20H +Number of ITLB misses where the low part of the linear to physical address +translation was missed. +.It Li ITLB_MISSES.PDP_MISS +.Pq Event 85H , Umask 40H +Number of ITLB misses where the high part of the linear to physical address +translation was missed. +.It Li ITLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 85H , Umask 80H +Counts number of completed large page walks due to misses in the STLB. +.It Li ITLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 01H , Umask 80H +Counts number of offcore demand data read requests. Does not count L2 +prefetch requests. +.It Li OFFCORE_REQUESTS.DEMAND.READ_CODE +.Pq Event B0H , Umask 02H +Counts number of offcore demand code read requests. Does not count L2 +prefetch requests.
+.It Li OFFCORE_REQUESTS.DEMAND.RFO +.Pq Event B0H , Umask 04H +Counts number of offcore demand RFO requests. Does not count L2 prefetch +requests. +.It Li OFFCORE_REQUESTS.ANY.READ +.Pq Event B0H , Umask 08H +Counts number of offcore read requests. Includes L2 prefetch requests. +.It Li OFFCORE_REQUESTS.ANY.RFO +.Pq Event B0H , Umask 10H +Counts number of offcore RFO requests. Includes L2 prefetch requests. +.It Li OFFCORE_REQUESTS.UNCACHED_MEM +.Pq Event B0H , Umask 20H +Counts number of offcore uncached memory requests. +.It Li OFFCORE_REQUESTS.ANY +.Pq Event B0H , Umask 80H +Counts all offcore requests. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.DATA +.Pq Event B3H , Umask 01H +Counts weighted cycles of snoopq requests for data. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.INVALIDATE +.Pq Event B3H , Umask 02H +Counts weighted cycles of snoopq invalidate requests. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.CODE +.Pq Event B3H , Umask 04H +Counts weighted cycles of snoopq requests for code. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.CODE +.Pq Event BAH , Umask 04H +Counts number of TPR reads +.It Li PIC_ACCESSES.TPR_WRITES +.Pq Event BAH , Umask 02H +Counts number of TPR writes +one or two micro-ops. Some instructions are decoded into longer sequences +.It Li MACHINE_CLEARS.FUSION_ASSIST +.Pq Event C3H , Umask 10H +Counts the number of macro-fusion assists +Counts SIMD packed single- precision floating point Uops retired. +.It Li BOGUS_BR +.Pq Event E4H , Umask 01H +Counts the number of bogus branches. 
+.It Li L2_HW_PREFETCH.HIT +.Pq Event F3H , Umask 01H +Count L2 HW prefetcher detector hits +.It Li L2_HW_PREFETCH.ALLOC +.Pq Event F3H , Umask 02H +Count L2 HW prefetcher allocations +.It Li L2_HW_PREFETCH.DATA_TRIGGER +.Pq Event F3H , Umask 04H +Count L2 HW data prefetcher triggered +.It Li L2_HW_PREFETCH.CODE_TRIGGER +.Pq Event F3H , Umask 08H +Count L2 HW code prefetcher triggered +.It Li L2_HW_PREFETCH.DCA_TRIGGER +.Pq Event F3H , Umask 10H +Count L2 HW DCA prefetcher triggered +.It Li L2_HW_PREFETCH.KICK_START +.Pq Event F3H , Umask 20H +Count L2 HW prefetcher kick started +.It Li SQ_MISC.PROMOTION +.Pq Event F4H , Umask 01H +Counts the number of L2 secondary misses that hit the Super Queue. +.It Li SQ_MISC.PROMOTION_POST_GO +.Pq Event F4H , Umask 02H +Counts the number of L2 secondary misses during the Super Queue filling L2. +.It Li SQ_MISC.LRU_HINTS +.Pq Event F4H , Umask 04H +Counts number of Super Queue LRU hints sent to L3. +.It Li SQ_MISC.FILL_DROPPED +.Pq Event F4H , Umask 08H +Counts the number of SQ L2 fills dropped due to L2 busy. +.It Li SEGMENT_REG_LOADS +.Pq Event F8H , Umask 01H +Counts number of segment register loads. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.ucf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7uc 3 , +.Xr pmc.westmere 3 , +.Xr pmc.westmereuc 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.corei7uc.3 b/lib/libpmc/pmc.corei7uc.3 new file mode 100644 index 0000000..a69eab7 --- /dev/null +++ b/lib/libpmc/pmc.corei7uc.3 @@ -0,0 +1,880 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2010 +.Dt PMC.COREI7UC 3 +.Os +.Sh NAME +.Nm pmc.corei7uc +.Nd uncore measurement events for +.Tn Intel +.Tn Core i7 and Xeon 5500 +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Core i7" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +These CPUs contain 2 classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_UCP" +.It Li PMC_CLASS_UCF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_UCP +Programmable counters that may be configured to count one of a defined +set of hardware events. 
+.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Core i7 and Xeon 5500 PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Ss COREI7 AND XEON 5500 UNCORE FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.ucf 3 . +.Ss COREI7 AND XEON 5500 UNCORE PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta \&No +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta \&No +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta \&No +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. 
+.El +.Ss Event Specifiers (Programmable PMCs) +Core i7 and Xeon 5500 uncore programmable PMCs support the following events: +.Bl -tag -width indent +.It Li GQ_CYCLES_FULL.READ_TRACKER +.Pq Event 00H , Umask 01H +Uncore cycles Global Queue read tracker is full. +.It Li GQ_CYCLES_FULL.WRITE_TRACKER +.Pq Event 00H , Umask 02H +Uncore cycles Global Queue write tracker is full. +.It Li GQ_CYCLES_FULL.PEER_PROBE_TRACKER +.Pq Event 00H , Umask 04H +Uncore cycles Global Queue peer probe tracker is full. The peer probe +tracker queue tracks snoops from the IOH and remote sockets. +.It Li GQ_CYCLES_NOT_EMPTY.READ_TRACKER +.Pq Event 01H , Umask 01H +Uncore cycles where Global Queue read tracker has at least one valid entry. +.It Li GQ_CYCLES_NOT_EMPTY.WRITE_TRACKER +.Pq Event 01H , Umask 02H +Uncore cycles where Global Queue write tracker has at least one valid entry. +.It Li GQ_CYCLES_NOT_EMPTY.PEER_PROBE_TRACKER +.Pq Event 01H , Umask 04H +Uncore cycles where Global Queue peer probe tracker has at least one valid +entry. The peer probe tracker queue tracks IOH and remote socket snoops. +.It Li GQ_ALLOC.READ_TRACKER +.Pq Event 03H , Umask 01H +Counts the number of read tracker allocate to deallocate entries. The GQ +read tracker allocate to deallocate occupancy count is divided by the count +to obtain the average read tracker latency. +.It Li GQ_ALLOC.RT_L3_MISS +.Pq Event 03H , Umask 02H +Counts the number of GQ read tracker entries for which a full cache line read +has missed the L3. The GQ read tracker L3 miss to fill occupancy count is +divided by this count to obtain the average cache line read L3 miss latency. +The latency represents the time after which the L3 has determined that the +cache line has missed. The time between a GQ read tracker allocation and the +L3 determining that the cache line has missed is the average L3 hit latency. +The total L3 cache line read miss latency is the hit latency + L3 miss +latency. 
+.It Li GQ_ALLOC.RT_TO_L3_RESP +.Pq Event 03H , Umask 04H +Counts the number of GQ read tracker entries that are allocated in the read +tracker queue that hit or miss the L3. The GQ read tracker L3 hit occupancy +count is divided by this count to obtain the average L3 hit latency. +.It Li GQ_ALLOC.RT_TO_RTID_ACQUIRED +.Pq Event 03H , Umask 08H +Counts the number of GQ read tracker entries that are allocated in the read +tracker, have missed in the L3 and have not acquired a Request Transaction +ID. The GQ read tracker L3 miss to RTID acquired occupancy count is +divided by this count to obtain the average latency for a read L3 miss to +acquire an RTID. +.It Li GQ_ALLOC.WT_TO_RTID_ACQUIRED +.Pq Event 03H , Umask 10H +Counts the number of GQ write tracker entries that are allocated in the +write tracker, have missed in the L3 and have not acquired a Request +Transaction ID. The GQ write tracker L3 miss to RTID occupancy count is +divided by this count to obtain the average latency for a write L3 miss to +acquire an RTID. +.It Li GQ_ALLOC.WRITE_TRACKER +.Pq Event 03H , Umask 20H +Counts the number of GQ write tracker entries that are allocated in the +write tracker queue that miss the L3. The GQ write tracker occupancy count +is divided by this count to obtain the average L3 write miss latency. +.It Li GQ_ALLOC.PEER_PROBE_TRACKER +.Pq Event 03H , Umask 40H +Counts the number of GQ peer probe tracker (snoop) entries that are +allocated in the peer probe tracker queue that miss the L3. The GQ peer +probe occupancy count is divided by this count to obtain the average L3 peer +probe miss latency. +.It Li GQ_DATA.FROM_QPI +.Pq Event 04H , Umask 01H +Cycles Global Queue Quickpath Interface input data port is busy importing +data from the Quickpath Interface. Each cycle the input port can transfer 8 +or 16 bytes of data. 
+.It Li GQ_DATA.FROM_QMC +.Pq Event 04H , Umask 02H +Cycles Global Queue Quickpath Memory Interface input data port is busy +importing data from the Quickpath Memory Interface. Each cycle the input +port can transfer 8 or 16 bytes of data. +.It Li GQ_DATA.FROM_L3 +.Pq Event 04H , Umask 04H +Cycles GQ L3 input data port is busy importing data from the Last Level +Cache. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.FROM_CORES_02 +.Pq Event 04H , Umask 08H +Cycles GQ Core 0 and 2 input data port is busy importing data from processor +cores 0 and 2. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.FROM_CORES_13 +.Pq Event 04H , Umask 10H +Cycles GQ Core 1 and 3 input data port is busy importing data from processor +cores 1 and 3. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.TO_QPI_QMC +.Pq Event 05H , Umask 01H +Cycles GQ QPI and QMC output data port is busy sending data to the Quickpath +Interface or Quickpath Memory Interface. Each cycle the output port can +transfer 32 bytes of data. +.It Li GQ_DATA.TO_L3 +.Pq Event 05H , Umask 02H +Cycles GQ L3 output data port is busy sending data to the Last Level Cache. +Each cycle the output port can transfer 32 bytes of data. +.It Li GQ_DATA.TO_CORES +.Pq Event 05H , Umask 04H +Cycles GQ Core output data port is busy sending data to the Cores. Each +cycle the output port can transfer 32 bytes of data. +.It Li SNP_RESP_TO_LOCAL_HOME.I_STATE +.Pq Event 06H , Umask 01H +Number of snoop responses to the local home that L3 does not have the +referenced cache line. +.It Li SNP_RESP_TO_LOCAL_HOME.S_STATE +.Pq Event 06H , Umask 02H +Number of snoop responses to the local home that L3 has the referenced line +cached in the S state. +.It Li SNP_RESP_TO_LOCAL_HOME.FWD_S_STATE +.Pq Event 06H , Umask 04H +Number of responses to code or data read snoops to the local home that the +L3 has the referenced cache line in the E state. 
The L3 cache line state is +changed to the S state and the line is forwarded to the local home in the S +state. +.It Li SNP_RESP_TO_LOCAL_HOME.FWD_I_STATE +.Pq Event 06H , Umask 08H +Number of responses to read invalidate snoops to the local home that the L3 +has the referenced cache line in the M state. The L3 cache line state is +invalidated and the line is forwarded to the local home in the M state. +.It Li SNP_RESP_TO_LOCAL_HOME.CONFLICT +.Pq Event 06H , Umask 10H +Number of conflict snoop responses sent to the local home. +.It Li SNP_RESP_TO_LOCAL_HOME.WB +.Pq Event 06H , Umask 20H +Number of responses to code or data read snoops to the local home that the +L3 has the referenced line cached in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.I_STATE +.Pq Event 07H , Umask 01H +Number of snoop responses to a remote home that L3 does not have the +referenced cache line. +.It Li SNP_RESP_TO_REMOTE_HOME.S_STATE +.Pq Event 07H , Umask 02H +Number of snoop responses to a remote home that L3 has the referenced line +cached in the S state. +.It Li SNP_RESP_TO_REMOTE_HOME.FWD_S_STATE +.Pq Event 07H , Umask 04H +Number of responses to code or data read snoops to a remote home that the L3 +has the referenced cache line in the E state. The L3 cache line state is +changed to the S state and the line is forwarded to the remote home in the S +state. +.It Li SNP_RESP_TO_REMOTE_HOME.FWD_I_STATE +.Pq Event 07H , Umask 08H +Number of responses to read invalidate snoops to a remote home that the L3 +has the referenced cache line in the M state. The L3 cache line state is +invalidated and the line is forwarded to the remote home in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.CONFLICT +.Pq Event 07H , Umask 10H +Number of conflict snoop responses sent to the local home. +.It Li SNP_RESP_TO_REMOTE_HOME.WB +.Pq Event 07H , Umask 20H +Number of responses to code or data read snoops to a remote home that the L3 +has the referenced line cached in the M state. 
+.It Li SNP_RESP_TO_REMOTE_HOME.HITM +.Pq Event 07H , Umask 24H +Number of HITM snoop responses to a remote home. +.It Li L3_HITS.READ +.Pq Event 08H , Umask 01H +Number of code read, data read and RFO requests that hit in the L3. +.It Li L3_HITS.WRITE +.Pq Event 08H , Umask 02H +Number of writeback requests that hit in the L3. Writebacks from the cores +will always result in L3 hits due to the inclusive property of the L3. +.It Li L3_HITS.PROBE +.Pq Event 08H , Umask 04H +Number of snoops from IOH or remote sockets that hit in the L3. +.It Li L3_HITS.ANY +.Pq Event 08H , Umask 03H +Number of reads and writes that hit the L3. +.It Li L3_MISS.READ +.Pq Event 09H , Umask 01H +Number of code read, data read and RFO requests that miss the L3. +.It Li L3_MISS.WRITE +.Pq Event 09H , Umask 02H +Number of writeback requests that miss the L3. Should always be zero as +writebacks from the cores will always result in L3 hits due to the inclusive +property of the L3. +.It Li L3_MISS.PROBE +.Pq Event 09H , Umask 04H +Number of snoops from IOH or remote sockets that miss the L3. +.It Li L3_MISS.ANY +.Pq Event 09H , Umask 03H +Number of reads and writes that miss the L3. +.It Li L3_LINES_IN.M_STATE +.Pq Event 0AH , Umask 01H +Counts the number of L3 lines allocated in M state. The only time a cache +line is allocated in the M state is when the line was forwarded in M state +due to a Snoop Read Invalidate Own request. +.It Li L3_LINES_IN.E_STATE +.Pq Event 0AH , Umask 02H +Counts the number of L3 lines allocated in E state. +.It Li L3_LINES_IN.S_STATE +.Pq Event 0AH , Umask 04H +Counts the number of L3 lines allocated in S state. +.It Li L3_LINES_IN.F_STATE +.Pq Event 0AH , Umask 08H +Counts the number of L3 lines allocated in F state. +.It Li L3_LINES_IN.ANY +.Pq Event 0AH , Umask 0FH +Counts the number of L3 lines allocated in any state. +.It Li L3_LINES_OUT.M_STATE +.Pq Event 0BH , Umask 01H +Counts the number of L3 lines victimized that were in the M state. 
When the +victim cache line is in M state, the line is written to its home cache agent +which can be either local or remote. +.It Li L3_LINES_OUT.E_STATE +.Pq Event 0BH , Umask 02H +Counts the number of L3 lines victimized that were in the E state. +.It Li L3_LINES_OUT.S_STATE +.Pq Event 0BH , Umask 04H +Counts the number of L3 lines victimized that were in the S state. +.It Li L3_LINES_OUT.I_STATE +.Pq Event 0BH , Umask 08H +Counts the number of L3 lines victimized that were in the I state. +.It Li L3_LINES_OUT.F_STATE +.Pq Event 0BH , Umask 10H +Counts the number of L3 lines victimized that were in the F state. +.It Li L3_LINES_OUT.ANY +.Pq Event 0BH , Umask 1FH +Counts the number of L3 lines victimized in any state. +.It Li QHL_REQUESTS.IOH_READS +.Pq Event 20H , Umask 01H +Counts number of Quickpath Home Logic read requests from the IOH. +.It Li QHL_REQUESTS.IOH_WRITES +.Pq Event 20H , Umask 02H +Counts number of Quickpath Home Logic write requests from the IOH. +.It Li QHL_REQUESTS.REMOTE_READS +.Pq Event 20H , Umask 04H +Counts number of Quickpath Home Logic read requests from a remote socket. +.It Li QHL_REQUESTS.REMOTE_WRITES +.Pq Event 20H , Umask 08H +Counts number of Quickpath Home Logic write requests from a remote socket. +.It Li QHL_REQUESTS.LOCAL_READS +.Pq Event 20H , Umask 10H +Counts number of Quickpath Home Logic read requests from the local socket. +.It Li QHL_REQUESTS.LOCAL_WRITES +.Pq Event 20H , Umask 20H +Counts number of Quickpath Home Logic write requests from the local socket. +.It Li QHL_CYCLES_FULL.IOH +.Pq Event 21H , Umask 01H +Counts uclk cycles all entries in the Quickpath Home Logic IOH are full. +.It Li QHL_CYCLES_FULL.REMOTE +.Pq Event 21H , Umask 02H +Counts uclk cycles all entries in the Quickpath Home Logic remote tracker +are full. +.It Li QHL_CYCLES_FULL.LOCAL +.Pq Event 21H , Umask 04H +Counts uclk cycles all entries in the Quickpath Home Logic local tracker are +full. 
+.It Li QHL_CYCLES_NOT_EMPTY.IOH +.Pq Event 22H , Umask 01H +Counts uclk cycles all entries in the Quickpath Home Logic IOH is busy. +.It Li QHL_CYCLES_NOT_EMPTY.REMOTE +.Pq Event 22H , Umask 02H +Counts uclk cycles all entries in the Quickpath Home Logic remote tracker is +busy. +.It Li QHL_CYCLES_NOT_EMPTY.LOCAL +.Pq Event 22H , Umask 04H +Counts uclk cycles all entries in the Quickpath Home Logic local tracker is +busy. +.It Li QHL_OCCUPANCY.IOH +.Pq Event 23H , Umask 01H +QHL IOH tracker allocate to deallocate read occupancy. +.It Li QHL_OCCUPANCY.REMOTE +.Pq Event 23H , Umask 02H +QHL remote tracker allocate to deallocate read occupancy. +.It Li QHL_OCCUPANCY.LOCAL +.Pq Event 23H , Umask 04H +QHL local tracker allocate to deallocate read occupancy. +.It Li QHL_ADDRESS_CONFLICTS.2WAY +.Pq Event 24H , Umask 02H +Counts number of QHL Active Address Table (AAT) entries that saw a max of 2 +conflicts. The AAT is a structure that tracks requests that are in conflict. +The requests themselves are in the home tracker entries. The count is +reported when an AAT entry deallocates. +.It Li QHL_ADDRESS_CONFLICTS.3WAY +.Pq Event 24H , Umask 04H +Counts number of QHL Active Address Table (AAT) entries that saw a max of 3 +conflicts. The AAT is a structure that tracks requests that are in conflict. +The requests themselves are in the home tracker entries. The count is +reported when an AAT entry deallocates. +.It Li QHL_CONFLICT_CYCLES.IOH +.Pq Event 25H , Umask 01H +Counts cycles the Quickpath Home Logic IOH Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_CONFLICT_CYCLES.REMOTE +.Pq Event 25H , Umask 02H +Counts cycles the Quickpath Home Logic Remote Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. 
+.It Li QHL_CONFLICT_CYCLES.LOCAL +.Pq Event 25H , Umask 04H +Counts cycles the Quickpath Home Logic Local Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_TO_QMC_BYPASS +.Pq Event 26H , Umask 01H +Counts number of requests to the Quickpath Memory Controller that bypass the +Quickpath Home Logic. All local accesses can be bypassed. For remote +requests, only read requests can be bypassed. +.It Li QMC_NORMAL_FULL.READ.CH0 +.Pq Event 27H , Umask 01H +Uncore cycles all the entries in the DRAM channel 0 medium or low priority +queue are occupied with read requests. +.It Li QMC_NORMAL_FULL.READ.CH1 +.Pq Event 27H , Umask 02H +Uncore cycles all the entries in the DRAM channel 1 medium or low priority +queue are occupied with read requests. +.It Li QMC_NORMAL_FULL.READ.CH2 +.Pq Event 27H , Umask 04H +Uncore cycles all the entries in the DRAM channel 2 medium or low priority +queue are occupied with read requests. +.It Li QMC_NORMAL_FULL.WRITE.CH0 +.Pq Event 27H , Umask 08H +Uncore cycles all the entries in the DRAM channel 0 medium or low priority +queue are occupied with write requests. +.It Li QMC_NORMAL_FULL.WRITE.CH1 +.Pq Event 27H , Umask 10H +Counts cycles all the entries in the DRAM channel 1 medium or low priority +queue are occupied with write requests. +.It Li QMC_NORMAL_FULL.WRITE.CH2 +.Pq Event 27H , Umask 20H +Uncore cycles all the entries in the DRAM channel 2 medium or low priority +queue are occupied with write requests. +.It Li QMC_ISOC_FULL.READ.CH0 +.Pq Event 28H , Umask 01H +Counts cycles all the entries in the DRAM channel 0 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.READ.CH1 +.Pq Event 28H , Umask 02H +Counts cycles all the entries in the DRAM channel 1 high priority queue are +occupied with isochronous read requests. 
+.It Li QMC_ISOC_FULL.READ.CH2 +.Pq Event 28H , Umask 04H +Counts cycles all the entries in the DRAM channel 2 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.WRITE.CH0 +.Pq Event 28H , Umask 08H +Counts cycles all the entries in the DRAM channel 0 high priority queue are +occupied with isochronous write requests. +.It Li QMC_ISOC_FULL.WRITE.CH1 +.Pq Event 28H , Umask 10H +Counts cycles all the entries in the DRAM channel 1 high priority queue are +occupied with isochronous write requests. +.It Li QMC_ISOC_FULL.WRITE.CH2 +.Pq Event 28H , Umask 20H +Counts cycles all the entries in the DRAM channel 2 high priority queue are +occupied with isochronous write requests. +.It Li QMC_BUSY.READ.CH0 +.Pq Event 29H , Umask 01H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 0. +.It Li QMC_BUSY.READ.CH1 +.Pq Event 29H , Umask 02H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 1. +.It Li QMC_BUSY.READ.CH2 +.Pq Event 29H , Umask 04H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 2. +.It Li QMC_BUSY.WRITE.CH0 +.Pq Event 29H , Umask 08H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 0. +.It Li QMC_BUSY.WRITE.CH1 +.Pq Event 29H , Umask 10H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 1. +.It Li QMC_BUSY.WRITE.CH2 +.Pq Event 29H , Umask 20H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 2. +.It Li QMC_OCCUPANCY.CH0 +.Pq Event 2AH , Umask 01H +IMC channel 0 normal read request occupancy. +.It Li QMC_OCCUPANCY.CH1 +.Pq Event 2AH , Umask 02H +IMC channel 1 normal read request occupancy. +.It Li QMC_OCCUPANCY.CH2 +.Pq Event 2AH , Umask 04H +IMC channel 2 normal read request occupancy. 
+.It Li QMC_ISSOC_OCCUPANCY.CH0 +.Pq Event 2BH , Umask 01H +IMC channel 0 issoc read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH1 +.Pq Event 2BH , Umask 02H +IMC channel 1 issoc read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH2 +.Pq Event 2BH , Umask 04H +IMC channel 2 issoc read request occupancy. +.It Li QMC_ISSOC_READS.ANY +.Pq Event 2BH , Umask 07H +IMC issoc read request occupancy. +.It Li QMC_NORMAL_READS.CH0 +.Pq Event 2CH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 medium and low +priority read requests. The QMC channel 0 normal read occupancy divided by +this count provides the average QMC channel 0 read latency. +.It Li QMC_NORMAL_READS.CH1 +.Pq Event 2CH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 medium and low +priority read requests. The QMC channel 1 normal read occupancy divided by +this count provides the average QMC channel 1 read latency. +.It Li QMC_NORMAL_READS.CH2 +.Pq Event 2CH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 medium and low +priority read requests. The QMC channel 2 normal read occupancy divided by +this count provides the average QMC channel 2 read latency. +.It Li QMC_NORMAL_READS.ANY +.Pq Event 2CH , Umask 07H +Counts the number of Quickpath Memory Controller medium and low priority +read requests. The QMC normal read occupancy divided by this count provides +the average QMC read latency. +.It Li QMC_HIGH_PRIORITY_READS.CH0 +.Pq Event 2DH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.CH1 +.Pq Event 2DH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.CH2 +.Pq Event 2DH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 high priority +isochronous read requests. 
+.It Li QMC_HIGH_PRIORITY_READS.ANY +.Pq Event 2DH , Umask 07H +Counts the number of Quickpath Memory Controller high priority isochronous +read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH0 +.Pq Event 2EH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH1 +.Pq Event 2EH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH2 +.Pq Event 2EH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.ANY +.Pq Event 2EH , Umask 07H +Counts the number of Quickpath Memory Controller critical priority +isochronous read requests. +.It Li QMC_WRITES.FULL.CH0 +.Pq Event 2FH , Umask 01H +Counts number of full cache line writes to DRAM channel 0. +.It Li QMC_WRITES.FULL.CH1 +.Pq Event 2FH , Umask 02H +Counts number of full cache line writes to DRAM channel 1. +.It Li QMC_WRITES.FULL.CH2 +.Pq Event 2FH , Umask 04H +Counts number of full cache line writes to DRAM channel 2. +.It Li QMC_WRITES.FULL.ANY +.Pq Event 2FH , Umask 07H +Counts number of full cache line writes to DRAM. +.It Li QMC_WRITES.PARTIAL.CH0 +.Pq Event 2FH , Umask 08H +Counts number of partial cache line writes to DRAM channel 0. +.It Li QMC_WRITES.PARTIAL.CH1 +.Pq Event 2FH , Umask 10H +Counts number of partial cache line writes to DRAM channel 1. +.It Li QMC_WRITES.PARTIAL.CH2 +.Pq Event 2FH , Umask 20H +Counts number of partial cache line writes to DRAM channel 2. +.It Li QMC_WRITES.PARTIAL.ANY +.Pq Event 2FH , Umask 38H +Counts number of partial cache line writes to DRAM. +.It Li QMC_CANCEL.CH0 +.Pq Event 30H , Umask 01H +Counts number of DRAM channel 0 cancel requests. +.It Li QMC_CANCEL.CH1 +.Pq Event 30H , Umask 02H +Counts number of DRAM channel 1 cancel requests. 
+.It Li QMC_CANCEL.CH2 +.Pq Event 30H , Umask 04H +Counts number of DRAM channel 2 cancel requests. +.It Li QMC_CANCEL.ANY +.Pq Event 30H , Umask 07H +Counts number of DRAM cancel requests. +.It Li QMC_PRIORITY_UPDATES.CH0 +.Pq Event 31H , Umask 01H +Counts number of DRAM channel 0 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.CH1 +.Pq Event 31H , Umask 02H +Counts number of DRAM channel 1 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.CH2 +.Pq Event 31H , Umask 04H +Counts number of DRAM channel 2 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.ANY +.Pq Event 31H , Umask 07H +Counts number of DRAM priority updates. A priority update occurs when an +ISOC high or critical request is received by the QHL and there is a matching +request with normal priority that has already been issued to the QMC. In +this instance, the QHL will send a priority update to QMC to expedite the +request. +.It Li QHL_FRC_ACK_CNFLTS.LOCAL +.Pq Event 33H , Umask 04H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic to the local home. 
+.It Li QPI_TX_STALLED_SINGLE_FLIT.HOME.LINK_0 +.Pq Event 40H , Umask 01H +Counts cycles the Quickpath outbound link 0 HOME virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.SNOOP.LINK_0 +.Pq Event 40H , Umask 02H +Counts cycles the Quickpath outbound link 0 SNOOP virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.NDR.LINK_0 +.Pq Event 40H , Umask 04H +Counts cycles the Quickpath outbound link 0 non-data response virtual +channel is stalled due to lack of a VNA and VN0 credit. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.HOME.LINK_1 +.Pq Event 40H , Umask 08H +Counts cycles the Quickpath outbound link 1 HOME virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.SNOOP.LINK_1 +.Pq Event 40H , Umask 10H +Counts cycles the Quickpath outbound link 1 SNOOP virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.NDR.LINK_1 +.Pq Event 40H , Umask 20H +Counts cycles the Quickpath outbound link 1 non-data response virtual +channel is stalled due to lack of a VNA and VN0 credit. 
Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.LINK_0 +.Pq Event 40H , Umask 07H +Counts cycles the Quickpath outbound link 0 virtual channels are stalled due +to lack of a VNA and VN0 credit. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.LINK_1 +.Pq Event 40H , Umask 38H +Counts cycles the Quickpath outbound link 1 virtual channels are stalled due +to lack of a VNA and VN0 credit. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.DRS.LINK_0 +.Pq Event 41H , Umask 01H +Counts cycles the Quickpath outbound link 0 Data ResponSe virtual channel is +stalled due to lack of VNA and VN0 credits. Note that this event does not +filter out when a flit would not have been selected for arbitration because +another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCB.LINK_0 +.Pq Event 41H , Umask 02H +Counts cycles the Quickpath outbound link 0 Non-Coherent Bypass virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCS.LINK_0 +.Pq Event 41H , Umask 04H +Counts cycles the Quickpath outbound link 0 Non-Coherent Standard virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. 
+.It Li QPI_TX_STALLED_MULTI_FLIT.DRS.LINK_1 +.Pq Event 41H , Umask 08H +Counts cycles the Quickpath outbound link 1 Data ResponSe virtual channel is +stalled due to lack of VNA and VN0 credits. Note that this event does not +filter out when a flit would not have been selected for arbitration because +another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCB.LINK_1 +.Pq Event 41H , Umask 10H +Counts cycles the Quickpath outbound link 1 Non-Coherent Bypass virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCS.LINK_1 +.Pq Event 41H , Umask 20H +Counts cycles the Quickpath outbound link 1 Non-Coherent Standard virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.LINK_0 +.Pq Event 41H , Umask 07H +Counts cycles the Quickpath outbound link 0 virtual channels are stalled due +to lack of VNA and VN0 credits. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.LINK_1 +.Pq Event 41H , Umask 38H +Counts cycles the Quickpath outbound link 1 virtual channels are stalled due +to lack of VNA and VN0 credits. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_HEADER.BUSY.LINK_0 +.Pq Event 42H , Umask 02H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 0 is busy. 
+.It Li QPI_TX_HEADER.BUSY.LINK_1 +.Pq Event 42H , Umask 08H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 1 is busy. +.It Li QPI_RX_NO_PPT_CREDIT.STALLS.LINK_0 +.Pq Event 43H , Umask 01H +Number of cycles that snoop packets incoming to the Quickpath Interface link +0 are stalled and not sent to the GQ because the GQ Peer Probe Tracker (PPT) +does not have any available entries. +.It Li QPI_RX_NO_PPT_CREDIT.STALLS.LINK_1 +.Pq Event 43H , Umask 02H +Number of cycles that snoop packets incoming to the Quickpath Interface link +1 are stalled and not sent to the GQ because the GQ Peer Probe Tracker (PPT) +does not have any available entries. +.It Li DRAM_OPEN.CH0 +.Pq Event 60H , Umask 01H +Counts number of DRAM Channel 0 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_OPEN.CH1 +.Pq Event 60H , Umask 02H +Counts number of DRAM Channel 1 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_OPEN.CH2 +.Pq Event 60H , Umask 04H +Counts number of DRAM Channel 2 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_PAGE_CLOSE.CH0 +.Pq Event 61H , Umask 01H +DRAM channel 0 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_CLOSE.CH1 +.Pq Event 61H , Umask 02H +DRAM channel 1 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_CLOSE.CH2 +.Pq Event 61H , Umask 04H +DRAM channel 2 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH0 +.Pq Event 62H , Umask 01H +Counts the number of precharges (PRE) that were issued to DRAM channel 0 +because there was a page miss. 
A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH1 +.Pq Event 62H , Umask 02H +Counts the number of precharges (PRE) that were issued to DRAM channel 1 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH2 +.Pq Event 62H , Umask 04H +Counts the number of precharges (PRE) that were issued to DRAM channel 2 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_READ_CAS.CH0 +.Pq Event 63H , Umask 01H +Counts the number of times a read CAS command was issued on DRAM channel 0. +.It Li DRAM_READ_CAS.AUTOPRE_CH0 +.Pq Event 63H , Umask 02H +Counts the number of times a read CAS command was issued on DRAM channel 0 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_READ_CAS.CH1 +.Pq Event 63H , Umask 04H +Counts the number of times a read CAS command was issued on DRAM channel 1. +.It Li DRAM_READ_CAS.AUTOPRE_CH1 +.Pq Event 63H , Umask 08H +Counts the number of times a read CAS command was issued on DRAM channel 1 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_READ_CAS.CH2 +.Pq Event 63H , Umask 10H +Counts the number of times a read CAS command was issued on DRAM channel 2. 
+.It Li DRAM_READ_CAS.AUTOPRE_CH2 +.Pq Event 63H , Umask 20H +Counts the number of times a read CAS command was issued on DRAM channel 2 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH0 +.Pq Event 64H , Umask 01H +Counts the number of times a write CAS command was issued on DRAM channel 0. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH0 +.Pq Event 64H , Umask 02H +Counts the number of times a write CAS command was issued on DRAM channel 0 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH1 +.Pq Event 64H , Umask 04H +Counts the number of times a write CAS command was issued on DRAM channel 1. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH1 +.Pq Event 64H , Umask 08H +Counts the number of times a write CAS command was issued on DRAM channel 1 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH2 +.Pq Event 64H , Umask 10H +Counts the number of times a write CAS command was issued on DRAM channel 2. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH2 +.Pq Event 64H , Umask 20H +Counts the number of times a write CAS command was issued on DRAM channel 2 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_REFRESH.CH0 +.Pq Event 65H , Umask 01H +Counts number of DRAM channel 0 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_REFRESH.CH1 +.Pq Event 65H , Umask 02H +Counts number of DRAM channel 1 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_REFRESH.CH2 +.Pq Event 65H , Umask 04H +Counts number of DRAM channel 2 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. 
+.It Li DRAM_PRE_ALL.CH0 +.Pq Event 66H , Umask 01H +Counts number of DRAM Channel 0 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_PRE_ALL.CH1 +.Pq Event 66H , Umask 02H +Counts number of DRAM Channel 1 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_PRE_ALL.CH2 +.Pq Event 66H , Umask 04H +Counts number of DRAM Channel 2 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.ucf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7 3 , +.Xr pmc.westmere 3 , +.Xr pmc.westmereuc 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.h b/lib/libpmc/pmc.h new file mode 100644 index 0000000..5b1ad07 --- /dev/null +++ b/lib/libpmc/pmc.h @@ -0,0 +1,112 @@ +/*- + * Copyright (c) 2003,2004 Joseph Koshy + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _PMC_H_ +#define _PMC_H_ + +#include <sys/cdefs.h> +#include <sys/pmc.h> + +/* + * Driver statistics. + */ +struct pmc_driverstats { + int pm_intr_ignored; /* #interrupts ignored */ + int pm_intr_processed; /* #interrupts processed */ + int pm_intr_bufferfull; /* #interrupts with ENOSPC */ + int pm_syscalls; /* #syscalls */ + int pm_syscall_errors; /* #syscalls with errors */ + int pm_buffer_requests; /* #buffer requests */ + int pm_buffer_requests_failed; /* #failed buffer requests */ + int pm_log_sweeps; /* #sample buffer processing passes */ +}; + +/* + * CPU information. + */ +struct pmc_cpuinfo { + enum pmc_cputype pm_cputype; /* the kind of CPU */ + uint32_t pm_ncpu; /* number of CPUs */ + uint32_t pm_npmc; /* #PMCs per CPU */ + uint32_t pm_nclass; /* #classes of PMCs */ + struct pmc_classinfo pm_classes[PMC_CLASS_MAX]; +}; + +/* + * Current PMC state. 
+ */ +struct pmc_pmcinfo { + int32_t pm_cpu; /* CPU number */ + struct pmc_info pm_pmcs[]; /* NPMC structs */ +}; + +/* + * Prototypes + */ + +__BEGIN_DECLS +int pmc_allocate(const char *_ctrspec, enum pmc_mode _mode, uint32_t _flags, + int _cpu, pmc_id_t *_pmcid); +int pmc_attach(pmc_id_t _pmcid, pid_t _pid); +int pmc_capabilities(pmc_id_t _pmc, uint32_t *_caps); +int pmc_configure_logfile(int _fd); +int pmc_flush_logfile(void); +int pmc_detach(pmc_id_t _pmcid, pid_t _pid); +int pmc_disable(int _cpu, int _pmc); +int pmc_enable(int _cpu, int _pmc); +int pmc_get_driver_stats(struct pmc_driverstats *_gms); +int pmc_get_msr(pmc_id_t _pmc, uint32_t *_msr); +int pmc_init(void); +int pmc_read(pmc_id_t _pmc, pmc_value_t *_value); +int pmc_release(pmc_id_t _pmc); +int pmc_rw(pmc_id_t _pmc, pmc_value_t _newvalue, pmc_value_t *_oldvalue); +int pmc_set(pmc_id_t _pmc, pmc_value_t _value); +int pmc_start(pmc_id_t _pmc); +int pmc_stop(pmc_id_t _pmc); +int pmc_width(pmc_id_t _pmc, uint32_t *_width); +int pmc_write(pmc_id_t _pmc, pmc_value_t _value); +int pmc_writelog(uint32_t _udata); + +int pmc_ncpu(void); +int pmc_npmc(int _cpu); +int pmc_cpuinfo(const struct pmc_cpuinfo **_cpu_info); +int pmc_pmcinfo(int _cpu, struct pmc_pmcinfo **_pmc_info); + +const char *pmc_name_of_capability(uint32_t _c); +const char *pmc_name_of_class(enum pmc_class _pc); +const char *pmc_name_of_cputype(enum pmc_cputype _cp); +const char *pmc_name_of_disposition(enum pmc_disp _pd); +const char *pmc_name_of_event(enum pmc_event _pe); +const char *pmc_name_of_mode(enum pmc_mode _pm); +const char *pmc_name_of_state(enum pmc_state _ps); + +int pmc_event_names_of_class(enum pmc_class _cl, const char ***_eventnames, + int *_nevents); +__END_DECLS + +#endif diff --git a/lib/libpmc/pmc.iaf.3 b/lib/libpmc/pmc.iaf.3 new file mode 100644 index 0000000..ec9f21c --- /dev/null +++ b/lib/libpmc/pmc.iaf.3 @@ -0,0 +1,149 @@ +.\" Copyright (c) 2008 Joseph Koshy. All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 14, 2008 +.Dt PMC.IAF 3 +.Os +.Sh NAME +.Nm pmc.iaf +.Nd measurement events for +.Tn Intel +fixed function performance counters. +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +fixed-function PMCs are present in CPUs that conform to version 2 or +later of the +.Tn Intel +Performance Measurement Architecture. +Each fixed-function PMC measures a specific hardware event. +The number of fixed-function PMCs implemented in a CPU can vary. +The number of fixed-function PMCs present can be determined at runtime +by using function +.Xr pmc_cpuinfo 3 . 
+.Pp +Intel fixed-function PMCs are documented in +.Rs +.%B "IA-32 Intel(R) Architecture Software Developer's Manual" +.%T "Volume 3: System Programming Guide" +.%N "Order Number 253669-027US" +.%D July 2008 +.%Q "Intel Corporation" +.Re +.Ss PMC Capabilities +Fixed-function PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta \&No +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta \&No +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta \&No +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Class Name Prefix +These PMCs are named using a class name prefix of +.Dq Li iaf- . +.Ss Event Qualifiers (Fixed Function PMCs) +These PMCs support the following modifiers: +.Bl -tag -width indent +.It Li os +Configure the PMC to count events occurring at ring level 0. +.It Li usr +Configure the PMC to count events occurring at ring levels 1, 2 +or 3. +.It Li anythread +.Pq Tn Atom CPUs +Configure the PMC to count events on all logical processors sharing a +processor core. +The default is to count events on the current logical processor. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. +.Ss Event Specifiers (Fixed Function PMCs) +The fixed function PMCs are selectable using the following +event names: +.Bl -tag -width indent +.It Li INSTR_RETIRED.ANY +.Pq Fixed Function Counter 0 +The number of instructions retired. +.It Li CPU_CLK_UNHALTED.CORE +.Pq Fixed Function Counter 1 +The number of core cycles for which the core is not halted. +.It Li CPU_CLK_UNHALTED.REF +.Pq Fixed Function Counter 2 +The number of reference cycles for which the core is not halted.
+.El +.Sh EXAMPLES +To measure the number of core cycles for which the core was not halted +use the event specifier +.Qq iaf-cpu-clk-unhalted.core . +.Pp +To measure the number of user instructions retired use the event specifier +.Qq iaf-instr-retired.any,usr . +.Pp +To measure the number of user instructions retired on all logical processors +in an +.Tn Atom +CPU, use the event specifier +.Qq iaf-instr-retired.any,usr,anythread . +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.k7.3 b/lib/libpmc/pmc.k7.3 new file mode 100644 index 0000000..2775d4f --- /dev/null +++ b/lib/libpmc/pmc.k7.3 @@ -0,0 +1,266 @@ +.\" Copyright (c) 2003-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd October 4, 2008 +.Dt PMC.K7 3 +.Os +.Sh NAME +.Nm pmc.k7 +.Nd measurement events for +.Tn AMD +.Tn Athlon +(K7 family) CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +AMD K7 PMCs are present in the +.Tn "AMD Athlon" +series of CPUs and are documented in: +.Rs +.%B "AMD Athlon Processor x86 Code Optimization Guide" +.%N "Publication No. 22007" +.%D "February 2002" +.%Q "Advanced Micro Devices, Inc." +.Re +.Ss PMC Features +AMD K7 PMCs are 48 bits wide. +Each K7 CPU contains 4 PMCs with the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +.Pp +Event specifiers for AMD K7 PMCs can have the following optional +qualifiers: +.Bl -tag -width indent +.It Li count= Ns Ar value +Configure the counter to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the counter to only count negated-to-asserted transitions +of the conditions expressed by the other qualifiers. 
+In other words, the counter will increment only once whenever a given +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li count +qualifier is present, making the counter increment when the +number of events per cycle is less than the value specified by +the +.Dq Li count +qualifier. +.It Li os +Configure the PMC to count events happening at privilege level 0. +.It Li unitmask= Ns Ar mask +This qualifier is used to further qualify a select few events, +.Dq Li k7-dc-refills-from-l2 , +.Dq Li k7-dc-refills-from-system +and +.Dq Li k7-dc-writebacks . +Here +.Ar mask +is a string of the following characters optionally separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li m +Count operations for lines in the +.Dq Modified +state. +.It Li o +Count operations for lines in the +.Dq Owner +state. +.It Li e +Count operations for lines in the +.Dq Exclusive +state. +.It Li s +Count operations for lines in the +.Dq Shared +state. +.It Li i +Count operations for lines in the +.Dq Invalid +state. +.El +.Pp +If no +.Dq Li unitmask +qualifier is specified, the default is to count events for cache +lines in any of the above states. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers were specified, the default is to enable both. +.Ss AMD K7 Event Specifiers +The event specifiers supported on AMD K7 PMCs are: +.Bl -tag -width indent +.It Li k7-dc-accesses +.Pq Event 40H +Count data cache accesses. +.It Li k7-dc-misses +.Pq Event 41H +Count data cache misses. +.It Li k7-dc-refills-from-l2 Op Li ,unitmask= Ns Ar mask +.Pq Event 42H +Count data cache refills from L2 cache. +This event may be further qualified using the +.Dq Li unitmask +qualifier.
+.It Li k7-dc-refills-from-system Op Li ,unitmask= Ns Ar mask +.Pq Event 43H +Count data cache refills from system memory. +This event may be further qualified using the +.Dq Li unitmask +qualifier. +.It Li k7-dc-writebacks Op Li ,unitmask= Ns Ar mask +.Pq Event 44H +Count data cache writebacks. +This event may be further qualified using the +.Dq Li unitmask +qualifier. +.It Li k7-hardware-interrupts +.Pq Event CFH +Count the number of taken hardware interrupts. +.It Li k7-ic-fetches +.Pq Event 80H +Count instruction cache fetches. +.It Li k7-ic-misses +.Pq Event 81H +Count instruction cache misses. +.It Li k7-interrupts-masked-cycles +.Pq Event CDH +Count the number of cycles when the processor's +.Va IF +flag was zero. +.It Li k7-interrupts-masked-while-pending-cycles +.Pq Event CEH +Count the number of cycles interrupts were masked while pending due +to the processor's +.Va IF +flag being zero. +.It Li k7-l1-and-l2-dtlb-misses +.Pq Event 46H +Count L1 and L2 DTLB misses. +.It Li k7-l1-dtlb-miss-and-l2-dtlb-hits +.Pq Event 45H +Count L1 DTLB misses and L2 DTLB hits. +.It Li k7-l1-itlb-misses +.Pq Event 84H +Count L1 ITLB misses that are L2 ITLB hits. +.It Li k7-l1-l2-itlb-misses +.Pq Event 85H +Count L1 (and L2) ITLB misses. +.It Li k7-misaligned-references +.Pq Event 47H +Count misaligned data references. +.It Li k7-retired-branches +.Pq Event C2H +Count all retired branches (conditional, unconditional, exceptions +and interrupts). +.It Li k7-retired-branches-mispredicted +.Pq Event C3H +Count all mispredicted retired branches. +.It Li k7-retired-far-control-transfers +.Pq Event C6H +Count retired far control transfers. +.It Li k7-retired-instructions +.Pq Event C0H +Count all retired instructions. +.It Li k7-retired-ops +.Pq Event C1H +Count retired ops. +.It Li k7-retired-resync-branches +.Pq Event C7H +Count retired resync branches (non control transfer branches). +.It Li k7-retired-taken-branches +.Pq Event C4H +Count retired taken branches. 
+.It Li k7-retired-taken-branches-mispredicted +.Pq Event C5H +Count mispredicted taken branches that were retired. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used. +.Bl -column "branch-mispredicts" "Description" +.It Em Alias Ta Em Event +.It Li branches Ta Li k7-retired-branches +.It Li branch-mispredicts Ta Li k7-retired-branches-mispredicted +.It Li dc-misses Ta Li k7-dc-misses +.It Li ic-misses Ta Li k7-ic-misses +.It Li instructions Ta Li k7-retired-instructions +.It Li interrupts Ta Li k7-hardware-interrupts +.It Li unhalted-cycles Ta (unsupported) +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.k8.3 b/lib/libpmc/pmc.k8.3 new file mode 100644 index 0000000..995bfac --- /dev/null +++ b/lib/libpmc/pmc.k8.3 @@ -0,0 +1,800 @@ +.\" Copyright (c) 2003-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd October 4, 2008 +.Dt PMC.K8 3 +.Os +.Sh NAME +.Nm pmc.k8 +.Nd measurement events for +.Tn AMD +.Tn Athlon 64 +(K8 family) CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +AMD K8 PMCs are present in the +.Tn "AMD Athlon64" +and +.Tn "AMD Opteron" +series of CPUs. +They are documented in the +.Rs +.%B "BIOS and Kernel Developer's Guide for the AMD Athlon(tm) 64 and AMD Opteron Processors" +.%N "Publication No. 26094" +.%D "April 2004" +.%Q "Advanced Micro Devices, Inc." +.Re +.Ss PMC Features +AMD K8 PMCs are 48 bits wide. 
+Each CPU contains 4 PMCs with the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +.Pp +Event specifiers for AMD K8 PMCs can have the following optional +qualifiers: +.Bl -tag -width indent +.It Li count= Ns Ar value +Configure the counter to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the counter to only count negated-to-asserted transitions +of the conditions expressed by the other fields. +In other words, the counter will increment only once whenever a given +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li count +qualifier is present, making the counter increment when the +number of events per cycle is less than the value specified by +the +.Dq Li count +qualifier. +.It Li mask= Ns Ar qualifier +Many event specifiers for AMD K8 PMCs need to be additionally +qualified using a mask qualifier. +These additional qualifiers are event-specific and are documented +along with their associated event specifiers below. +.It Li os +Configure the PMC to count events happening at privilege level 0. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers were specified, the default is to enable both.
+.Ss AMD K8 Event Specifiers +The event specifiers supported on AMD K8 PMCs are: +.Bl -tag -width indent +.It Li k8-bu-cpu-clk-unhalted +.Pq Event 76H +Count the number of clock cycles when the CPU is not in the HLT or +STPCLK states. +.It Li k8-bu-fill-request-l2-miss Op Li ,mask= Ns Ar qualifier +.Pq Event 7EH +Count fill requests that missed in the L2 cache. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li dc-fill +Count data cache fill requests. +.It Li ic-fill +Count instruction cache fill requests. +.It Li tlb-reload +Count TLB reloads. +.El +.Pp +The default is to count all types of requests. +.It Li k8-bu-fill-into-l2 Op Li ,mask= Ns Ar qualifier +.Pq Event 7FH +The number of lines written to and from the L2 cache. +The event may be further qualified by using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li dirty-l2-victim +Count lines written into L2 cache due to victim writebacks from the +Icache or Dcache, TLB page table walks or hardware data prefetches. +.It Li victim-from-l2 +Count writebacks of dirty lines from L2 to the system. +.El +.It Li k8-bu-internal-l2-request Op Li ,mask= Ns Ar qualifier +.Pq Event 7DH +Count internally generated requests to the L2 cache. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li cancelled +Count cancelled requests. +.It Li dc-fill +Count data cache fill requests. +.It Li ic-fill +Count instruction cache fill requests. +.It Li tag-snoop +Count tag snoop requests. +.It Li tlb-reload +Count TLB reloads. +.El +.Pp +The default is to count all types of requests. +.It Li k8-dc-access +.Pq Event 40H +Count data cache accesses including microcode scratch pad accesses. 
+.It Li k8-dc-copyback Op Li ,mask= Ns Ar qualifier +.Pq Event 44H +Count data cache copyback operations. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li exclusive +Count operations for lines in the +.Dq exclusive +state. +.It Li invalid +Count operations for lines in the +.Dq invalid +state. +.It Li modified +Count operations for lines in the +.Dq modified +state. +.It Li owner +Count operations for lines in the +.Dq owner +state. +.It Li shared +Count operations for lines in the +.Dq shared +state. +.El +.Pp +The default is to count operations for lines in all the +above states. +.It Li k8-dc-dcache-accesses-by-locks Op Li ,mask= Ns Ar qualifier +.Pq Event 4CH +Count data cache accesses by lock instructions. +This event is only available on processors of revision C or later +vintage. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li accesses +Count data cache accesses by lock instructions. +.It Li misses +Count data cache misses by lock instructions. +.El +.Pp +The default is to count all accesses. +.It Li k8-dc-dispatched-prefetch-instructions Op Li ,mask= Ns Ar qualifier +.Pq Event 4BH +Count the number of dispatched prefetch instructions. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li load +Count load operations. +.It Li nta +Count non-temporal operations. +.It Li store +Count store operations. +.El +.Pp +The default is to count all operations. +.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-hit +.Pq Event 45H +Count L1 DTLB misses that are L2 DTLB hits. +.It Li k8-dc-l1-dtlb-miss-and-l2-dtlb-miss +.Pq Event 46H +Count L1 DTLB misses that are also misses in the L2 DTLB. 
+.It Li k8-dc-microarchitectural-early-cancel-of-an-access +.Pq Event 49H +Count microarchitectural early cancels of data cache accesses. +.It Li k8-dc-microarchitectural-late-cancel-of-an-access +.Pq Event 48H +Count microarchitectural late cancels of data cache accesses. +.It Li k8-dc-misaligned-data-reference +.Pq Event 47H +Count misaligned data references. +.It Li k8-dc-miss +.Pq Event 41H +Count data cache misses. +.It Li k8-dc-one-bit-ecc-error Op Li ,mask= Ns Ar qualifier +.Pq Event 4AH +Count one bit ECC errors found by the scrubber. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li scrubber +Count scrubber detected errors. +.It Li piggyback +Count piggyback scrubber errors. +.El +.Pp +The default is to count both kinds of errors. +.It Li k8-dc-refill-from-l2 Op Li ,mask= Ns Ar qualifier +.Pq Event 42H +Count data cache refills from L2 cache. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li exclusive +Count operations for lines in the +.Dq exclusive +state. +.It Li invalid +Count operations for lines in the +.Dq invalid +state. +.It Li modified +Count operations for lines in the +.Dq modified +state. +.It Li owner +Count operations for lines in the +.Dq owner +state. +.It Li shared +Count operations for lines in the +.Dq shared +state. +.El +.Pp +The default is to count operations for lines in all the +above states. +.It Li k8-dc-refill-from-system Op Li ,mask= Ns Ar qualifier +.Pq Event 43H +Count data cache refills from system memory. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li exclusive +Count operations for lines in the +.Dq exclusive +state. 
+.It Li invalid +Count operations for lines in the +.Dq invalid +state. +.It Li modified +Count operations for lines in the +.Dq modified +state. +.It Li owner +Count operations for lines in the +.Dq owner +state. +.It Li shared +Count operations for lines in the +.Dq shared +state. +.El +.Pp +The default is to count operations for lines in all the +above states. +.It Li k8-fp-cycles-with-no-fpu-ops-retired +.Pq Event 01H +Count cycles when no FPU ops were retired. +This event is supported in revision B and later CPUs. +.It Li k8-fp-dispatched-fpu-fast-flag-ops +.Pq Event 02H +Count dispatched FPU ops that use the fast flag interface. +This event is supported in revision B and later CPUs. +.It Li k8-fp-dispatched-fpu-ops Op Li ,mask= Ns Ar qualifier +.Pq Event 00H +Count the number of dispatched FPU ops. +This event is supported in revision B and later CPUs. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li add-pipe-excluding-junk-ops +Count add pipe ops excluding junk ops. +.It Li add-pipe-junk-ops +Count junk ops in the add pipe. +.It Li multiply-pipe-excluding-junk-ops +Count multiply pipe ops excluding junk ops. +.It Li multiply-pipe-junk-ops +Count junk ops in the multiply pipe. +.It Li store-pipe-excluding-junk-ops +Count store pipe ops excluding junk ops +.It Li store-pipe-junk-ops +Count junk ops in the store pipe. +.El +.Pp +The default is to count all types of ops. +.It Li k8-fr-decoder-empty +.Pq Event D0H +Count cycles when there was nothing to dispatch (i.e., the decoder +was empty). +.It Li k8-fr-dispatch-stall-for-segment-load +.Pq Event D4H +Count dispatch stalls for segment loads. +.It Li k8-fr-dispatch-stall-for-serialization +.Pq Event D3H +Count dispatch stalls for serialization. +.It Li k8-fr-dispatch-stall-from-branch-abort-to-retire +.Pq Event D2H +Count dispatch stalls from branch abort to retiral. 
+.It Li k8-fr-dispatch-stall-when-fpu-is-full +.Pq Event D7H +Count dispatch stalls when the FPU is full. +.It Li k8-fr-dispatch-stall-when-ls-is-full +.Pq Event D8H +Count dispatch stalls when the load/store unit is full. +.It Li k8-fr-dispatch-stall-when-reorder-buffer-is-full +.Pq Event D5H +Count dispatch stalls when the reorder buffer is full. +.It Li k8-fr-dispatch-stall-when-reservation-stations-are-full +.Pq Event D6H +Count dispatch stalls when reservation stations are full. +.It Li k8-fr-dispatch-stall-when-waiting-far-xfer-or-resync-branch-pending +.Pq Event DAH +Count dispatch stalls when a far control transfer or a resync branch +is pending. +.It Li k8-fr-dispatch-stall-when-waiting-for-all-to-be-quiet +.Pq Event D9H +Count dispatch stalls when waiting for all to be quiet. +.\" XXX What does "waiting for all to be quiet" mean? +.It Li k8-fr-dispatch-stalls +.Pq Event D1H +Count all dispatch stalls. +.It Li k8-fr-fpu-exceptions Op Li ,mask= Ns Ar qualifier +.Pq Event DBH +Count FPU exceptions. +This event is supported in revision B and later CPUs. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li sse-and-x87-microtraps +Count SSE and x87 microtraps. +.It Li sse-reclass-microfaults +Count SSE reclass microfaults. +.It Li sse-retype-microfaults +Count SSE retype microfaults. +.It Li x87-reclass-microfaults +Count x87 reclass microfaults. +.El +.Pp +The default is to count all types of exceptions. +.It Li k8-fr-interrupts-masked-cycles +.Pq Event CDH +Count cycles when interrupts were masked (i.e., when CPU RFLAGS field IF was zero). +.It Li k8-fr-interrupts-masked-while-pending-cycles +.Pq Event CEH +Count cycles while interrupts were masked while pending (i.e., cycles +when INTR was asserted while CPU RFLAGS field IF was zero). +.It Li k8-fr-number-of-breakpoints-for-dr0 +.Pq Event DCH +Count the number of breakpoints for DR0.
+.It Li k8-fr-number-of-breakpoints-for-dr1 +.Pq Event DDH +Count the number of breakpoints for DR1. +.It Li k8-fr-number-of-breakpoints-for-dr2 +.Pq Event DEH +Count the number of breakpoints for DR2. +.It Li k8-fr-number-of-breakpoints-for-dr3 +.Pq Event DFH +Count the number of breakpoints for DR3. +.It Li k8-fr-retired-branches +.Pq Event C2H +Count retired branches including exceptions and interrupts. +.It Li k8-fr-retired-branches-mispredicted +.Pq Event C3H +Count mispredicted retired branches. +.It Li k8-fr-retired-far-control-transfers +.Pq Event C6H +Count retired far control transfers (which are always mispredicted). +.It Li k8-fr-retired-fastpath-double-op-instructions Op Li ,mask= Ns Ar qualifier +.Pq Event CCH +Count retired fastpath double op instructions. +This event is supported in revision B and later CPUs. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li low-op-pos-0 +Count instructions with the low op in position 0. +.It Li low-op-pos-1 +Count instructions with the low op in position 1. +.It Li low-op-pos-2 +Count instructions with the low op in position 2. +.El +.Pp +The default is to count all types of instructions. +.It Li k8-fr-retired-fpu-instructions Op Li ,mask= Ns Ar qualifier +.Pq Event CBH +Count retired FPU instructions. +This event is supported in revision B and later CPUs. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li mmx-3dnow +Count MMX and 3DNow!\& instructions. +.It Li packed-sse-sse2 +Count packed SSE and SSE2 instructions. +.It Li scalar-sse-sse2 +Count scalar SSE and SSE2 instructions +.It Li x87 +Count x87 instructions. +.El +.Pp +The default is to count all types of instructions. +.It Li k8-fr-retired-near-returns +.Pq Event C8H +Count retired near returns. 
+.It Li k8-fr-retired-near-returns-mispredicted +.Pq Event C9H +Count mispredicted near returns. +.It Li k8-fr-retired-resyncs +.Pq Event C7H +Count retired resyncs (non-control transfer branches). +.It Li k8-fr-retired-taken-branches +.Pq Event C4H +Count retired taken branches. +.It Li k8-fr-retired-taken-branches-mispredicted +.Pq Event C5H +Count retired taken branches that were mispredicted. +.It Li k8-fr-retired-taken-branches-mispredicted-by-addr-miscompare +.Pq Event CAH +Count retired taken branches that were mispredicted only due to an +address miscompare. +.It Li k8-fr-retired-taken-hardware-interrupts +.Pq Event CFH +Count retired taken hardware interrupts. +.It Li k8-fr-retired-uops +.Pq Event C1H +Count retired uops. +.It Li k8-fr-retired-x86-instructions +.Pq Event C0H +Count retired x86 instructions including exceptions and interrupts. +.It Li k8-ic-fetch +.Pq Event 80H +Count instruction cache fetches. +.It Li k8-ic-instruction-fetch-stall +.Pq Event 87H +Count cycles in stalls due to instruction fetch. +.It Li k8-ic-l1-itlb-miss-and-l2-itlb-hit +.Pq Event 84H +Count L1 ITLB misses that are L2 ITLB hits. +.It Li k8-ic-l1-itlb-miss-and-l2-itlb-miss +.Pq Event 85H +Count ITLB misses that miss in both L1 and L2 ITLBs. +.It Li k8-ic-microarchitectural-resync-by-snoop +.Pq Event 86H +Count microarchitectural resyncs caused by snoops. +.It Li k8-ic-miss +.Pq Event 81H +Count instruction cache misses. +.It Li k8-ic-refill-from-l2 +.Pq Event 82H +Count instruction cache refills from L2 cache. +.It Li k8-ic-refill-from-system +.Pq Event 83H +Count instruction cache refills from system memory. +.It Li k8-ic-return-stack-hits +.Pq Event 88H +Count hits to the return stack. +.It Li k8-ic-return-stack-overflow +.Pq Event 89H +Count overflows of the return stack. +.It Li k8-ls-buffer2-full +.Pq Event 23H +Count load/store buffer2 full events. +.It Li k8-ls-locked-operation Op Li ,mask= Ns Ar qualifier +.Pq Event 24H +Count locked operations. 
+For revision C and later CPUs, the following qualifiers are supported: +.Pp +.Bl -tag -width indent -compact +.It Li cycles-in-request +Count the number of cycles in the lock request/grant stage. +.It Li cycles-to-complete +Count the number of cycles a lock takes to complete once it is +non-speculative and is the oldest load/store operation. +.It Li locked-instructions +Count the number of lock instructions executed. +.El +.Pp +The default is to count the number of lock instructions executed. +.It Li k8-ls-microarchitectural-late-cancel +.Pq Event 25H +Count microarchitectural late cancels of operations in the load/store +unit. +.It Li k8-ls-microarchitectural-resync-by-self-modifying-code +.Pq Event 21H +Count microarchitectural resyncs caused by self-modifying code. +.It Li k8-ls-microarchitectural-resync-by-snoop +.Pq Event 22H +Count microarchitectural resyncs caused by snoops. +.It Li k8-ls-retired-cflush-instructions +.Pq Event 26H +Count retired CLFLUSH instructions. +.It Li k8-ls-retired-cpuid-instructions +.Pq Event 27H +Count retired CPUID instructions. +.It Li k8-ls-segment-register-load Op Li ,mask= Ns Ar qualifier +.Pq Event 20H +Count segment register loads. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Bl -tag -width indent -compact +.It Li cs +Count CS register loads. +.It Li ds +Count DS register loads. +.It Li es +Count ES register loads. +.It Li fs +Count FS register loads. +.It Li gs +Count GS register loads. +.\" .It Li hs +.\" Count HS register loads. +.\" XXX "HS" register? +.It Li ss +Count SS register loads. +.El +.Pp +The default is to count all types of loads. +.It Li k8-nb-ht-bus0-bandwidth Op Li ,mask= Ns Ar qualifier +.It Li k8-nb-ht-bus1-bandwidth Op Li ,mask= Ns Ar qualifier +.It Li k8-nb-ht-bus2-bandwidth Op Li ,mask= Ns Ar qualifier +.Pq Events F6H, F7H and F8H respectively +Count events on the HyperTransport(tm) buses.
+These events may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li buffer-release +Count buffer release messages sent. +.It Li command +Count command messages sent. +.It Li data +Count data messages sent. +.It Li nop +Count nop messages sent. +.El +.Pp +The default is to count all types of messages. +.It Li k8-nb-memory-controller-bypass-saturation Op Li ,mask= Ns Ar qualifier +.Pq Event E4H +Count memory controller bypass counter saturation events. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li dram-controller-interface-bypass +Count DRAM controller interface bypass. +.It Li dram-controller-queue-bypass +Count DRAM controller queue bypass. +.It Li memory-controller-hi-pri-bypass +Count memory controller high priority bypasses. +.It Li memory-controller-lo-pri-bypass +Count memory controller low priority bypasses. +.El +.Pp +.It Li k8-nb-memory-controller-dram-slots-missed +.Pq Event E2H +Count memory controller DRAM command slots missed (in MemClks). +.It Li k8-nb-memory-controller-page-access-event Op Li ,mask= Ns Ar qualifier +.Pq Event E0H +Count memory controller page access events. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li page-conflict +Count page conflicts. +.It Li page-hit +Count page hits. +.It Li page-miss +Count page misses. +.El +.Pp +The default is to count all types of events. +.It Li k8-nb-memory-controller-page-table-overflow +.Pq Event E1H +Count memory control page table overflow events. +.It Li k8-nb-memory-controller-turnaround Op Li ,mask= Ns Ar qualifier +.Pq Event E3H +Count memory control turnaround events. 
+This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.\" XXX doc is unclear whether these are cycle counts or event counts +.It Li dimm-turnaround +Count DIMM turnarounds. +.It Li read-to-write-turnaround +Count read to write turnarounds. +.It Li write-to-read-turnaround +Count write to read turnarounds. +.El +.Pp +The default is to count all types of events. +.It Li k8-nb-probe-result Op Li ,mask= Ns Ar qualifier +.Pq Event ECH +Count probe events. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li probe-hit +Count all probe hits. +.It Li probe-hit-dirty-no-memory-cancel +Count probe hits without memory cancels. +.It Li probe-hit-dirty-with-memory-cancel +Count probe hits with memory cancels. +.It Li probe-miss +Count probe misses. +.El +.It Li k8-nb-sized-commands Op Li ,mask= Ns Ar qualifier +.Pq Event EBH +Count sized commands issued. +This event may be further qualified using +.Ar qualifier , +which is a +.Ql + +separated set of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li nonpostwrszbyte +.It Li nonpostwrszdword +.It Li postwrszbyte +.It Li postwrszdword +.It Li rdszbyte +.It Li rdszdword +.It Li rdmodwr +.El +.Pp +The default is to count all types of commands. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used. 
+.Bl -column "branch-mispredicts" "Description" +.It Em Alias Ta Em Event +.It Li branches Ta Li k8-fr-retired-taken-branches +.It Li branch-mispredicts Ta Li k8-fr-retired-taken-branches-mispredicted +.It Li dc-misses Ta Li k8-dc-miss +.It Li ic-misses Ta Li k8-ic-miss +.It Li instructions Ta Li k8-fr-retired-x86-instructions +.It Li interrupts Ta Li k8-fr-taken-hardware-interrupts +.It Li unhalted-cycles Ta Li k8-bu-cpu-clk-unhalted +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.mips.3 b/lib/libpmc/pmc.mips.3 new file mode 100644 index 0000000..194838e --- /dev/null +++ b/lib/libpmc/pmc.mips.3 @@ -0,0 +1,410 @@ +.\" Copyright (c) 2010 George Neville-Neil. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall George Neville-Neil be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd February 11, 2010 +.Os +.Dt PMC.MIPS 3 +.Sh NAME +.Nm pmc.mips +.Nd measurement events for +.Tn MIPS +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +MIPS PMCs are present in MIPS +.Tn "24k" +and other processors in the MIPS family. +.Pp +There are two counters supported by the hardware and each is 32 bits +wide. +.Pp +MIPS PMCs are documented in +.Rs +.%B "MIPS32 24K Processor Core Family Software User's Manual" +.%D December 2008 +.%Q "MIPS Technologies Inc." +.Re +.Ss Event Specifiers (Programmable PMCs) +MIPS programmable PMCs support the following events: +.Bl -tag -width indent +.It Li CYCLE +.Pq Event 0, Counter 0/1 +Total number of cycles. +The performance counters are clocked by the +top-level gated clock. +If the core is built with that clock gater +present, none of the counters will increment while the clock is +stopped - due to a WAIT instruction. +.It Li INSTR_EXECUTED +.Pq Event 1, Counter 0/1 +Total number of instructions completed. +.It Li BRANCH_COMPLETED +.Pq Event 2, Counter 0 +Total number of branch instructions completed. +.It Li BRANCH_MISPRED +.Pq Event 2, Counter 1 +Counts all branch instructions which completed, but were mispredicted. +.It Li RETURN +.Pq Event 3, Counter 0 +Counts all JR R31 instructions completed. 
+.It Li RETURN_MISPRED +.Pq Event 3, Counter 1 +Counts all JR $31 instructions which completed, used the RPS for a prediction, but were mispredicted. +.It Li RETURN_NOT_31 +.Pq Event 4, Counter 0 +Counts all JR $xx (not $31) and JALR instructions (indirect jumps). +.It Li RETURN_NOTPRED +.Pq Event 4, Counter 1 +If RPS use is disabled, JR $31 will not be predicted. +.It Li ITLB_ACCESS +.Pq Event 5, Counter 0 +Counts ITLB accesses that are due to fetches showing up in the +instruction fetch stage of the pipeline and which do not use a fixed +mapping or are not in unmapped space. +If an address is fetched twice from the pipe (as in the case of a +cache miss), that instruction will count as 2 ITLB accesses. +Since each fetch gets us 2 instructions, there is one access marked per double +word. +.It Li ITLB_MISS +.Pq Event 5, Counter 1 +Counts all misses in the ITLB except ones that are on the back of another +miss. +We cannot process back to back misses and thus those are +ignored. +They are also ignored if there is some form of address error. +.It Li DTLB_ACCESS +.Pq Event 6, Counter 0 +Counts DTLB accesses including those in unmapped address spaces. +.It Li DTLB_MISS +.Pq Event 6, Counter 1 +Counts DTLB misses. Back to back misses that result in only one DTLB +entry getting refilled are counted as a single miss. +.It Li JTLB_IACCESS +.Pq Event 7, Counter 0 +Instruction JTLB accesses are counted exactly the same as ITLB misses. +.It Li JTLB_IMISS +.Pq Event 7, Counter 1 +Counts instruction JTLB accesses that result in no match or a match on +an invalid translation. +.It Li JTLB_DACCESS +.Pq Event 8, Counter 0 +Data JTLB accesses. +.It Li JTLB_DMISS +.Pq Event 8, Counter 1 +Counts data JTLB accesses that result in no match or a match on an invalid translation. +.It Li IC_FETCH +.Pq Event 9, Counter 0 +Counts every time the instruction cache is accessed. All replays, +wasted fetches etc. are counted.
+For example, following a branch, even though the prediction is taken, +the fall through access is counted. + +.It Li IC_MISS +.Pq Event 9, Counter 1 +Counts all instruction cache misses that result in a bus request. +.It Li DC_LOADSTORE +.Pq Event 10, Counter 0 +Counts cached loads and stores. +.It Li DC_WRITEBACK +.Pq Event 10, Counter 1 +Counts cache lines written back to memory due to replacement or cacheops. +.It Li DC_MISS +.Pq Event 11, Counter 0/1 +Counts loads and stores that miss in the cache. +.It Li LOAD_MISS +.Pq Event 13, Counter 0 +Counts number of cacheable loads that miss in the cache. +.It Li STORE_MISS +.Pq Event 13, Counter 1 +Counts number of cacheable stores that miss in the cache. +.It Li INTEGER_COMPLETED +.Pq Event 14, Counter 0 +Non-floating point, non-Coprocessor 2 instructions. +.It Li FP_COMPLETED +.Pq Event 14, Counter 1 +Floating point instructions completed. +.It Li LOAD_COMPLETED +.Pq Event 15, Counter 0 +Integer and co-processor loads completed. +.It Li STORE_COMPLETED +.Pq Event 15, Counter 1 +Integer and co-processor stores completed. +.It Li BARRIER_COMPLETED +.Pq Event 16, Counter 0 +Direct jump (and link) instructions completed. +.It Li MIPS16_COMPLETED +.Pq Event 16, Counter 1 +MIPS16c instructions completed. +.It Li NOP_COMPLETED +.Pq Event 17, Counter 0 +NOPs completed. +This includes all instructions that normally write to a general +purpose register, but where the destination register was set to r0. +.It Li INTEGER_MULDIV_COMPLETED +.Pq Event 17, Counter 1 +Integer multiply and divide instructions completed (MULxx, DIVx, MADDx, MSUBx). +.It Li RF_STALL +.Pq Event 18, Counter 0 +Counts the total number of cycles where no instructions are issued +from the IFU to ALU (the RF stage does not advance) which includes +both of the previous two events. +The RF_STALL is different than the sum of them though because cycles +when both stalls are active will only be counted once.
+.It Li INSTR_REFETCH +.Pq Event 18, Counter 1 +Counts replay traps (other than uTLB). +.It Li STORE_COND_COMPLETED +.Pq Event 19, Counter 0 +Conditional stores completed. Counts all events, including failed stores. +.It Li STORE_COND_FAILED +.Pq Event 19, Counter 1 +Conditional store instruction that did not update memory. +Note: While this event and the SC instruction count event can be configured to +count in specific operating modes, the timing of the events is much +different and the observed operating mode could change between them, +causing some inaccuracy in the measured ratio. +.It Li ICACHE_REQUESTS +.Pq Event 20, Counter 0 +Note that this only counts PREFs that are actually attempted. +PREFs to uncached addresses or ones with translation errors are not counted. +.It Li ICACHE_HIT +.Pq Event 20, Counter 1 +Counts PREF instructions that hit in the cache. +.It Li L2_WRITEBACK +.Pq Event 21, Counter 0 +Counts cache lines written back to memory due to replacement or cacheops. +.It Li L2_ACCESS +.Pq Event 21, Counter 1 +Number of accesses to L2 Cache. +.It Li L2_MISS +.Pq Event 22, Counter 0 +Number of accesses that missed in the L2 cache. +.It Li L2_ERR_CORRECTED +.Pq Event 22, Counter 1 +Single bit errors in L2 Cache that were detected and corrected. +.It Li EXCEPTIONS +.Pq Event 23, Counter 0 +Any type of exception taken. +.It Li RF_CYCLES_STALLED +.Pq Event 24, Counter 0 +Counts cycles where the LSU is in fixup and cannot accept a new +instruction from the ALU. +Fixups are replays within the LSU that occur when an instruction needs +to re-access the cache or the DTLB. +.It Li IFU_CYCLES_STALLED +.Pq Event 25, Counter 0 +Counts the number of cycles where the fetch unit is not providing a +valid instruction to the ALU. +.It Li ALU_CYCLES_STALLED +.Pq Event 25, Counter 1 +Counts the number of cycles where the ALU pipeline cannot advance. +.It Li UNCACHED_LOAD +.Pq Event 33, Counter 0 +Counts uncached and uncached accelerated loads.
+.It Li UNCACHED_STORE +.Pq Event 33, Counter 1 +Counts uncached and uncached accelerated stores. +.It Li CP2_REG_TO_REG_COMPLETED +.Pq Event 35, Counter 0 +Co-processor 2 register to register instructions completed. +.It Li MFTC_COMPLETED +.Pq Event 35, Counter 1 +Co-processor 2 move to and from instructions as well as loads and stores. +.It Li IC_BLOCKED_CYCLES +.Pq Event 37, Counter 0 +Cycles when IFU stalls because an instruction miss caused the IFU not +to have any runnable instructions. +Ignores the stalls due to ITLB misses as well as the 4 cycles +following a redirect. +.It Li DC_BLOCKED_CYCLES +.Pq Event 37, Counter 1 +Counts all cycles where integer pipeline waits on Load return data due +to a D-cache miss. +The LSU can signal a "long stall" on a D-cache miss, in which case +the waiting TC might be rescheduled so other TCs can execute +instructions till the data returns. +.It Li L2_IMISS_STALL_CYCLES +.Pq Event 38, Counter 0 +Cycles where the main pipeline is stalled waiting for a SYNC to complete. +.It Li L2_DMISS_STALL_CYCLES +.Pq Event 38, Counter 1 +Cycles where the main pipeline is stalled because of an index conflict +in the Fill Store Buffer. +.It Li DMISS_CYCLES +.Pq Event 39, Counter 0 +Data miss is outstanding, but not necessarily stalling the pipeline. +The difference between this and D$ miss stall cycles can show the gain +from non-blocking cache misses. +.It Li L2_MISS_CYCLES +.Pq Event 39, Counter 1 +L2 miss is outstanding, but not necessarily stalling the pipeline. +.It Li UNCACHED_BLOCK_CYCLES +.Pq Event 40, Counter 0 +Cycles where the processor is stalled on an uncached fetch, load, or store. +.It Li MDU_STALL_CYCLES +.Pq Event 41, Counter 0 +Counts all cycles where integer pipeline waits on MDU return data. +.It Li FPU_STALL_CYCLES +.Pq Event 41, Counter 1 +Counts all cycles where integer pipeline waits on FPU return data.
+.It Li CP2_STALL_CYCLES +.Pq Event 42, Counter 0 +Counts all cycles where integer pipeline waits on CP2 return data. +.It Li COREXTEND_STALL_CYCLES +.Pq Event 42, Counter 1 +Counts all cycles where integer pipeline waits on CorExtend return data. +.It Li ISPRAM_STALL_CYCLES +.Pq Event 43, Counter 0 +Count all pipeline bubbles that are a result of multicycle ISPRAM +access. +Pipeline bubbles are defined as all cycles that IFU doesn't present an +instruction to ALU. The four cycles after a redirect are not counted. +.It Li DSPRAM_STALL_CYCLES +.Pq Event 43, Counter 1 +Counts stall cycles created by an instruction waiting for access to DSPRAM. +.It Li CACHE_STALL_CYCLES +.Pq Event 44, Counter 0 +Counts all cycles where the pipeline is stalled due to CACHE +instructions. +Includes cycles where CACHE instructions themselves are +stalled in the ALU, and cycles where CACHE instructions cause +subsequent instructions to be stalled. +.It Li LOAD_TO_USE_STALLS +.Pq Event 45, Counter 0 +Counts all cycles where integer pipeline waits on Load return data. +.It Li BASE_MISPRED_STALLS +.Pq Event 45, Counter 1 +Counts stall cycles due to skewed ALU where the bypass to the address +generation takes an extra cycle. +.It Li CPO_READ_STALLS +.Pq Event 46, Counter 0 +Counts all cycles where integer pipeline waits on return data from +MFC0, RDHWR instructions. +.It Li BRANCH_MISPRED_CYCLES +.Pq Event 46, Counter 1 +This counts the number of cycles from a mispredicted branch until the +next non-delay slot instruction executes. +.It Li IFETCH_BUFFER_FULL +.Pq Event 48, Counter 0 +Counts the number of times an instruction cache miss was detected, but +both fill buffers were already allocated. +.It Li FETCH_BUFFER_ALLOCATED +.Pq Event 48, Counter 1 +Number of cycles where at least one of the IFU fill buffers is +allocated (miss pending). +.It Li EJTAG_ITRIGGER +.Pq Event 49, Counter 0 +Number of times an EJTAG Instruction Trigger Point condition matched.
+.It Li EJTAG_DTRIGGER +.Pq Event 49, Counter 1 +Number of times an EJTAG Data Trigger Point condition matched. +.It Li FSB_LT_QUARTER +.Pq Event 50, Counter 0 +Fill store buffer less than one quarter full. +.It Li FSB_QUARTER_TO_HALF +.Pq Event 50, Counter 1 +Fill store buffer between one quarter and one half full. +.It Li FSB_GT_HALF +.Pq Event 51, Counter 0 +Fill store buffer more than half full. +.It Li FSB_FULL_PIPELINE_STALLS +.Pq Event 51, Counter 1 +Cycles where the pipeline is stalled because the Fill-Store Buffer in LSU is full. +.It Li LDQ_LT_QUARTER +.Pq Event 52, Counter 0 +Load data queue less than one quarter full. +.It Li LDQ_QUARTER_TO_HALF +.Pq Event 52, Counter 1 +Load data queue between one quarter and one half full. +.It Li LDQ_GT_HALF +.Pq Event 53, Counter 0 +Load data queue more than one half full. +.It Li LDQ_FULL_PIPELINE_STALLS +.Pq Event 53, Counter 1 +Cycles where the pipeline is stalled because the Load Data Queue in the LSU is full. +.It Li WBB_LT_QUARTER +.Pq Event 54, Counter 0 +Write back buffer less than one quarter full. +.It Li WBB_QUARTER_TO_HALF +.Pq Event 54, Counter 1 +Write back buffer between one quarter and one half full. +.It Li WBB_GT_HALF +.Pq Event 55, Counter 0 +Write back buffer more than one half full. +.It Li WBB_FULL_PIPELINE_STALLS +.Pq Event 55, Counter 1 +Cycles where the pipeline is stalled because the Write Back Buffer is full. +.It Li REQUEST_LATENCY +.Pq Event 61, Counter 0 +Measures latency from miss detection until critical dword of response +is returned. Only counts for cacheable reads. +.It Li REQUEST_COUNT +.Pq Event 61, Counter 1 +Counts number of cacheable read requests used for previous latency counter. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used.
+.Bl -column "branch-mispredicts" "cpu_clk_unhalted.core_p" +.It Em Alias Ta Em Event Ta +.It Li instructions Ta Li INSTR_EXECUTED Ta +.It Li branches Ta Li BRANCH_COMPLETED Ta +.It Li branch-mispredicts Ta Li BRANCH_MISPRED Ta +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . +MIPS support was added by +.An "George Neville-Neil" +.Aq gnn@FreeBSD.org . +.Sh CAVEATS +The MIPS code does not yet support sampling. diff --git a/lib/libpmc/pmc.p4.3 b/lib/libpmc/pmc.p4.3 new file mode 100644 index 0000000..e13fa6e --- /dev/null +++ b/lib/libpmc/pmc.p4.3 @@ -0,0 +1,1226 @@ +.\" Copyright (c) 2003-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd October 4, 2008 +.Dt PMC.P4 3 +.Os +.Sh NAME +.Nm pmc.p4 +.Nd measurement events for +.Tn "Intel Pentium 4" +and other +.Tn Netburst +architecture CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +Intel P4 PMCs are present in Intel +.Tn "Pentium 4" +and +.Tn Xeon +processors that use the +.Tn Netburst +CPU architecture. +.Pp +These PMCs are documented in +.Rs +.%B "IA-32 Intel(R) Architecture Software Developer's Manual" +.%T "Volume 3: System Programming Guide" +.%N "Order Number 245472-012" +.%D 2003 +.%Q "Intel Corporation" +.Re +Further information about using these PMCs may be found in +.Rs +.%B "IA-32 Intel(R) Architecture Optimization Guide" +.%D 2003 +.%N "Order Number 248966-009" +.%Q "Intel Corporation" +.Re +Some of these events are affected by processor errata described in +.Rs +.%B "Intel(R) Pentium(R) 4 Processor Specification Update" +.%N "Document Number: 249199-059" +.%D "April 2005" +.%Q "Intel Corporation" +.Re +.Ss PMC Features +Intel Pentium 4 PMCs are 40 bits wide. +Each CPU contains 18 PMCs, divided into 4 groups with 4, 4, 4 and 6 +PMCs respectively. +On processors with hyperthreading support, PMC resources are shared +between logical processors. 
+These PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta Yes +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta Unimplemented +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta Yes +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +.Pp +Event specifiers for Intel P4 PMCs can have the following common +qualifiers: +.Bl -tag -width indent +.It Li active= Ns Ar choice +(On P4 HTT CPUs) Filter event counting based on which logical +processors are active. +The allowed values of +.Ar choice +are: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count when either logical processor is active. +.It Li both +Count when both logical processors are active. +.It Li none +Count only when neither logical processor is active. +.It Li single +Count only when one logical processor is active. +.El +.Pp +The default is +.Dq Li both . +.It Li cascade +Configure the PMC to cascade onto its partner. +See +.Sx "Cascading P4 PMCs" +below for more information. +.It Li edge +Configure the counter to count false to true transitions of the threshold +comparison output. +This qualifier only takes effect if a threshold qualifier has also been +specified. +.It Li complement +Configure the counter to increment only when the event count seen is +less than the threshold qualifier value specified. +.It Li mask= Ns Ar qualifier +Many event specifiers for Intel P4 PMCs need to be additionally +qualified using a mask qualifier. +The allowed syntax for these qualifiers is event specific and is +described along with the events. +.It Li os +Configure the PMC to count when the CPL of the processor is 0. +.It Li precise +Select precise event based sampling. +Precise sampling is supported by the hardware for a limited set of +events. 
+.It Li tag= Ns Ar value +Configure the PMC to tag the internal uop selected by the other +fields in this event specifier with value +.Ar value . +This feature is used when cascading PMCs. +.It Li threshold= Ns Ar value +Configure the PMC to increment only when the event counts seen are +greater than the specified threshold value +.Ar value . +.It Li usr +Configure the PMC to count when the CPL of the processor is 1, 2 or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. +.Pp +On Intel Pentium 4 processors with HTT, events are +divided into two classes: +.Pp +.Bl -tag -width indent -compact +.It "TS Events" +are those where hardware can differentiate between events +generated on one logical processor from those generated on the +other. +.It "TI Events" +are those where hardware cannot differentiate between events +generated by multiple logical processors in a package. +.El +.Pp +Only TS events are allowed for use with process-mode PMCs on +Pentium-4/HTT CPUs. +.Pp +The event specifiers supported by Intel P4 PMCs are: +.Pp +.Bl -tag -width indent +.It Li p4-128bit-mmx-uop Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count integer SIMD SSE2 instructions that operate on 128 bit SIMD +operands. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li all +Count all uops operating on 128 bit SIMD integer operands in memory or +XMM register. +.El +.Pp +If an instruction contains more than one 128 bit MMX uop, then each +uop will be counted. +.It Li p4-64bit-mmx-uop Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count MMX instructions that operate on 64 bit SIMD operands. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li all +Count all uops operating on 64 bit SIMD integer operands in memory or +in MMX registers. 
+.El +.Pp +If an instruction contains more than one 64 bit MMX uop, then each +uop will be counted. +.It Li p4-b2b-cycles +.Pq "TI event" +Count back-to-back bus cycles. +Further documentation for this event is unavailable. +.It Li p4-bnr +.Pq "TI event" +Count bus-not-ready conditions. +Further documentation for this event is unavailable. +.It Li p4-bpu-fetch-request Op Li ,mask= Ns Ar qualifier +.Pq "TS event" +Count instruction fetch requests qualified by additional +flags specified in +.Ar qualifier . +At this point only one flag is supported: +.Pp +.Bl -tag -width indent -compact +.It Li tcmiss +Count trace cache lookup misses. +.El +.Pp +The default qualifier is also +.Dq Li mask=tcmiss . +.It Li p4-branch-retired Op Li ,mask= Ns Ar flags +.Pq "TS event" +Counts retired branches. +Qualifier +.Ar flags +is a list of the following +.Ql + +separated strings: +.Pp +.Bl -tag -width indent -compact +.It Li mmnp +Count branches not-taken and predicted. +.It Li mmnm +Count branches not-taken and mis-predicted. +.It Li mmtp +Count branches taken and predicted. +.It Li mmtm +Count branches taken and mis-predicted. +.El +.Pp +The default qualifier counts all four kinds of branches. +.It Li p4-bsq-active-entries Op Li ,mask= Ns Ar qualifier +.Pq "TS event" +Count the number of entries (clipped at 15) currently active in the +BSQ. +Qualifier +.Ar qualifier +is a +.Ql + +separated set of the following flags: +.Pp +.Bl -tag -width indent -compact +.It Li req-type0 , Li req-type1 +Forms a 2-bit number used to select the request type encoding: +.Pp +.Bl -tag -width indent -compact +.It Li 0 +reads excluding read invalidate +.It Li 1 +read invalidates +.It Li 2 +writes other than writebacks +.It Li 3 +writebacks +.El +.Pp +Bit +.Dq Li req-type1 +is the MSB for this two bit number. 
+.It Li req-len0 , Li req-len1 +Forms a two-bit number that specifies the request length encoding: +.Pp +.Bl -tag -width indent -compact +.It Li 0 +0 chunks +.It Li 1 +1 chunk +.It Li 3 +8 chunks +.El +.Pp +Bit +.Dq Li req-len1 +is the MSB for this two bit number. +.It Li req-io-type +Count requests that are input or output requests. +.It Li req-lock-type +Count requests that lock the bus. +.It Li req-lock-cache +Count requests that lock the cache. +.It Li req-split-type +Count requests that is a bus 8-byte chunk that is split across an +8-byte boundary. +.It Li req-dem-type +Count requests that are demand (not prefetches) if set. +Count requests that are prefetches if not set. +.It Li req-ord-type +Count requests that are ordered. +.It Li mem-type0 , Li mem-type1 , Li mem-type2 +Forms a 3-bit number that specifies a memory type encoding: +.Pp +.Bl -tag -width indent -compact +.It Li 0 +UC +.It Li 1 +USWC +.It Li 4 +WT +.It Li 5 +WP +.It Li 6 +WB +.El +.Pp +Bit +.Dq Li mem-type2 +is the MSB of this 3-bit number. +.El +.Pp +The default qualifier has all the above bits set. +.Pp +Edge triggering using the +.Dq Li edge +qualifier should not be used with this event when counting cycles. +.It Li p4-bsq-allocation Op Li ,mask= Ns Ar qualifier +.Pq "TS event" +Count allocations in the bus sequence unit according to the flags +specified in +.Ar qualifier , +which is a +.Ql + +separated set of the following flags: +.Pp +.Bl -tag -width indent -compact +.It Li req-type0 , Li req-type1 +Forms a 2-bit number used to select the request type encoding: +.Pp +.Bl -tag -width indent -compact +.It Li 0 +reads excluding read invalidate +.It Li 1 +read invalidates +.It Li 2 +writes other than writebacks +.It Li 3 +writebacks +.El +.Pp +Bit +.Dq Li req-type1 +is the MSB for this two bit number. 
+.It Li req-len0 , Li req-len1 +Forms a two-bit number that specifies the request length encoding: +.Pp +.Bl -tag -width indent -compact +.It Li 0 +0 chunks +.It Li 1 +1 chunk +.It Li 3 +8 chunks +.El +.Pp +Bit +.Dq Li req-len1 +is the MSB for this two bit number. +.It Li req-io-type +Count requests that are input or output requests. +.It Li req-lock-type +Count requests that lock the bus. +.It Li req-lock-cache +Count requests that lock the cache. +.It Li req-split-type +Count requests that is a bus 8-byte chunk that is split across an +8-byte boundary. +.It Li req-dem-type +Count requests that are demand (not prefetches) if set. +Count requests that are prefetches if not set. +.It Li req-ord-type +Count requests that are ordered. +.It Li mem-type0 , Li mem-type1 , Li mem-type2 +Forms a 3-bit number that specifies a memory type encoding: +.Pp +.Bl -tag -width indent -compact +.It Li 0 +UC +.It Li 1 +USWC +.It Li 4 +WT +.It Li 5 +WP +.It Li 6 +WB +.El +.Pp +Bit +.Dq Li mem-type2 +is the MSB of this 3-bit number. +.El +.Pp +The default qualifier has all the above bits set. +.Pp +This event is usually used along with the +.Dq Li edge +qualifier to avoid multiple counting. +.It Li p4-bsq-cache-reference Op Li ,mask= Ns Ar qualifier +.Pq "TS event" +Count cache references as seen by the bus unit (2nd or 3rd level +cache references). +Qualifier +.Ar qualifier +is a +.Ql + +separated list of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li rd-2ndl-hits +Count 2nd level cache hits in the shared state. +.It Li rd-2ndl-hite +Count 2nd level cache hits in the exclusive state. +.It Li rd-2ndl-hitm +Count 2nd level cache hits in the modified state. +.It Li rd-3rdl-hits +Count 3rd level cache hits in the shared state. +.It Li rd-3rdl-hite +Count 3rd level cache hits in the exclusive state. +.It Li rd-3rdl-hitm +Count 3rd level cache hits in the modified state. +.It Li rd-2ndl-miss +Count 2nd level cache misses. 
+.It Li rd-3rdl-miss +Count 3rd level cache misses. +.It Li wr-2ndl-miss +Count write-back lookups from the data access cache that miss the 2nd +level cache. +.El +.Pp +The default is to count all the above events. +.It Li p4-execution-event Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count the retirement of tagged uops selected through the execution +tagging mechanism. +Qualifier +.Ar flags +can contain the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li nbogus0 , Li nbogus1 , Li nbogus2 , Li nbogus3 +The marked uops are not bogus. +.It Li bogus0 , Li bogus1 , Li bogus2 , Li bogus3 +The marked uops are bogus. +.El +.Pp +This event requires additional (upstream) events to be allocated to +perform the desired uop tagging. +The default is to set all the above flags. +This event can be used for precise event based sampling. +.It Li p4-front-end-event Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count the retirement of tagged uops selected through the front-end +tagging mechanism. +Qualifier +.Ar flags +can contain the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li nbogus +The marked uops are not bogus. +.It Li bogus +The marked uops are bogus. +.El +.Pp +This event requires additional (upstream) events to be allocated to +perform the desired uop tagging. +The default is to select both kinds of events. +This event can be used for precise event based sampling. +.It Li p4-fsb-data-activity Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count each DBSY or DRDY event selected by qualifier +.Ar flags . +Qualifier +.Ar flags +is a +.Ql + +separated set of the following flags: +.Pp +.Bl -tag -width indent -compact +.It Li drdy-drv +Count when this processor is driving data onto the bus. +.It Li drdy-own +Count when this processor is reading data from the bus. +.It Li drdy-other +Count when data is on the bus but not being sampled by this processor. 
+.It Li dbsy-drv +Count when this processor reserves the bus for use in the next cycle +in order to drive data. +.It Li dbsy-own +Count when some agent reserves the bus for use in the next bus cycle +to drive data that this processor will sample. +.It Li dbsy-other +Count when some agent reserves the bus for use in the next bus cycle +to drive data that this processor will not sample. +.El +.Pp +Flags +.Dq Li drdy-own +and +.Dq Li drdy-other +are mutually exclusive. +Flags +.Dq Li dbsy-own +and +.Dq Li dbsy-other +are mutually exclusive. +The default value for +.Ar flags +is +.Dq Li drdy-drv+drdy-own+dbsy-drv+dbsy-own . +.It Li p4-global-power-events Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count cycles during which the processor is not stopped. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li running +Count cycles when the processor is active. +.El +.Pp +.It Li p4-instr-retired Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count instructions retired during a clock cycle. +Qualifier +.Ar flags +comprises the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li nbogusntag +Count non-bogus instructions that are not tagged. +.It Li nbogustag +Count non-bogus instructions that are tagged. +.It Li bogusntag +Count bogus instructions that are not tagged. +.It Li bogustag +Count bogus instructions that are tagged. +.El +.Pp +The default qualifier counts all the above kinds of instructions. +.It Li p4-ioq-active-entries Xo +.Op Li ,mask= Ns Ar qualifier +.Op Li ,busreqtype= Ns Ar req-type +.Xc +.Pq "TS event" +Count the number of entries (clipped at 15) in the IOQ that are +active. +The event masks are specified by qualifier +.Ar qualifier +and +.Ar req-type . +.Pp +Qualifier +.Ar qualifier +is a +.Ql + +separated set of the following flags: +.Pp +.Bl -tag -width indent -compact +.It Li all-read +Count read entries. 
+.It Li all-write +Count write entries. +.It Li mem-uc +Count entries accessing un-cacheable memory. +.It Li mem-wc +Count entries accessing write-combining memory. +.It Li mem-wt +Count entries accessing write-through memory. +.It Li mem-wp +Count entries accessing write-protected memory. +.It Li mem-wb +Count entries accessing write-back memory. +.It Li own +Count store requests driven by the processor (i.e., not by other +processors or by DMA). +.It Li other +Count store requests driven by other processors or by DMA. +.It Li prefetch +Include hardware and software prefetch requests in the count. +.El +.Pp +The default value for +.Ar qualifier +is to enable all the above flags. +.Pp +The +.Ar req-type +qualifier is a 5-bit number that can additionally be used to select a +specific bus request type. +The default is 0. +.Pp +The +.Dq Li edge +qualifier should not be used when counting cycles with this event. +The exact behavior of this event depends on the processor revision. +.It Li p4-ioq-allocation Xo +.Op Li ,mask= Ns Ar qualifier +.Op Li ,busreqtype= Ns Ar req-type +.Xc +.Pq "TS event" +Count various types of transactions on the bus matching the flags set +in +.Ar qualifier +and +.Ar req-type . +.Pp +Qualifier +.Ar qualifier +is a +.Ql + +separated set of the following flags: +.Pp +.Bl -tag -width indent -compact +.It Li all-read +Count read entries. +.It Li all-write +Count write entries. +.It Li mem-uc +Count entries accessing un-cacheable memory. +.It Li mem-wc +Count entries accessing write-combining memory. +.It Li mem-wt +Count entries accessing write-through memory. +.It Li mem-wp +Count entries accessing write-protected memory. +.It Li mem-wb +Count entries accessing write-back memory. +.It Li own +Count store requests driven by the processor (i.e., not by other +processors or by DMA). +.It Li other +Count store requests driven by other processors or by DMA. +.It Li prefetch +Include hardware and software prefetch requests in the count. 
+.El +.Pp +The default value for +.Ar qualifier +is to enable all the above flags. +.Pp +The +.Ar req-type +qualifier is a 5-bit number that can additionally be used to select a +specific bus request type. +The default is 0. +.Pp +The +.Dq Li edge +qualifier is normally used with this event to prevent multiple +counting. +The exact behavior of this event depends on the processor revision. +.It Li p4-itlb-reference Op Li ,mask= Ns Ar qualifier +.Pq "TS event" +Count translations using the instruction translation look-aside +buffer. +The +.Ar qualifier +argument is a list of the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li hit +Count ITLB hits. +.It Li miss +Count ITLB misses. +.It Li hit-uc +Count un-cacheable ITLB hits. +.El +.Pp +If no +.Ar qualifier +is specified, the default is to count all three kinds of ITLB +translations. +.It Li p4-load-port-replay Op Li ,mask= Ns Ar qualifier +.Pq "TS event" +Count replayed events at the load port. +Qualifier +.Ar qualifier +can take on one value: +.Pp +.Bl -tag -width indent -compact +.It Li split-ld +Count split loads. +.El +.Pp +The default value for +.Ar qualifier +is +.Dq Li split-ld . +.It Li p4-mispred-branch-retired Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count mispredicted IA-32 branch instructions. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li nbogus +Count non-bogus retired branch instructions. +.El +.It Li p4-machine-clear Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count the number of pipeline clears seen by the processor. +Qualifier +.Ar flags +is a list of the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li clear +Count for a portion of the many cycles when the machine is being +cleared for any reason. +.It Li moclear +Count machine clears due to memory ordering issues. +.It Li smclear +Count machine clears due to self-modifying code. 
+.El +.Pp +Use qualifier +.Dq Li edge +to get a count of occurrences of machine clears. +The default qualifier is +.Dq Li clear . +.It Li p4-memory-cancel Op Li ,mask= Ns Ar event-list +.Pq "TS event" +Count the canceling of various kinds of requests in the data cache +address control unit of the CPU. +The qualifier +.Ar event-list +is a list of the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li st-rb-full +Requests canceled because no store request buffer was available. +.It Li 64k-conf +Requests that conflict due to 64K aliasing. +.El +.Pp +If +.Ar event-list +is not specified, then the default is to count both kinds of events. +.It Li p4-memory-complete Op Li ,mask= Ns Ar event-list +.Pq "TS event" +Count the completion of load split, store split, un-cacheable split and +un-cacheable load operations selected by qualifier +.Ar event-list . +The qualifier +.Ar event-list +is a +.Ql + +separated list of the following flags: +.Pp +.Bl -tag -width indent -compact +.It Li lsc +Count load splits completed, excluding loads from un-cacheable or +write-combining areas. +.It Li ssc +Count any split stores completed. +.El +.Pp +The default is to count both kinds of operations. +.It Li p4-mob-load-replay Op Li ,mask= Ns Ar qualifier +.Pq "TS event" +Count load replays triggered by the memory order buffer. +Qualifier +.Ar qualifier +can be a +.Ql + +separated list of the following flags: +.Pp +.Bl -tag -width indent -compact +.It Li no-sta +Count replays because of unknown store addresses. +.It Li no-std +Count replays because of unknown store data. +.It Li partial-data +Count replays because of partially overlapped data accesses between +load and store operations. +.It Li unalgn-addr +Count replays because of mismatches in the lower 4 bits of load and +store operations. +.El +.Pp +The default qualifier is +.Dq Li no-sta+no-std+partial-data+unalgn-addr . 
+.It Li p4-packed-dp-uop Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count packed double-precision uops. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li all +Count all uops operating on packed double-precision operands. +.El +.It Li p4-packed-sp-uop Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count packed single-precision uops. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li all +Count all uops operating on packed single-precision operands. +.El +.It Li p4-page-walk-type Op Li ,mask= Ns Ar qualifier +.Pq "TI event" +Count page walks performed by the page miss handler. +Qualifier +.Ar qualifier +can be a +.Ql + +separated list of the following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li dtmiss +Count page walks for data TLB misses. +.It Li itmiss +Count page walks for instruction TLB misses. +.El +.Pp +The default value for +.Ar qualifier +is +.Dq Li dtmiss+itmiss . +.It Li p4-replay-event Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count the retirement of tagged uops selected through the replay +tagging mechanism. +Qualifier +.Ar flags +contains a +.Ql + +separated set of the following strings: +.Pp +.Bl -tag -width indent -compact +.It Li nbogus +The marked uops are not bogus. +.It Li bogus +The marked uops are bogus. +.El +.Pp +This event requires additional (upstream) events to be allocated to +perform the desired uop tagging. +The default qualifier counts both kinds of uops. +This event can be used for precise event based sampling. +.It Li p4-resource-stall Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count the occurrence or latency of stalls in the allocator. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li sbfull +A stall due to the lack of store buffers. 
+.El +.It Li p4-response +.Pq "TI event" +Count different types of responses. +Further documentation on this event is not available. +.It Li p4-retired-branch-type Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count branches retired. +Qualifier +.Ar flags +contains a +.Ql + +separated list of strings: +.Pp +.Bl -tag -width indent -compact +.It Li conditional +Count conditional jumps. +.It Li call +Count direct and indirect call branches. +.It Li return +Count return branches. +.It Li indirect +Count returns, indirect calls or indirect jumps. +.El +.Pp +The default qualifier counts all the above branch types. +.It Li p4-retired-mispred-branch-type Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count mispredicted branches retired. +Qualifier +.Ar flags +contains a +.Ql + +separated list of strings: +.Pp +.Bl -tag -width indent -compact +.It Li conditional +Count conditional jumps. +.It Li call +Count indirect call branches. +.It Li return +Count return branches. +.It Li indirect +Count returns, indirect calls or indirect jumps. +.El +.Pp +The default qualifier counts all the above branch types. +.It Li p4-scalar-dp-uop Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count the number of scalar double-precision uops. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li all +Count the number of scalar double-precision uops. +.El +.It Li p4-scalar-sp-uop Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count the number of scalar single-precision uops. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li all +Count all uops operating on scalar single-precision operands. +.El +.It Li p4-snoop +.Pq "TI event" +Count snoop traffic. +Further documentation on this event is not available. 
+.It Li p4-sse-input-assist Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count the number of times an assist is required to handle problems +with the operands for SSE and SSE2 operations. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li all +Count assists for all SSE and SSE2 uops. +.El +.It Li p4-store-port-replay Op Li ,mask= Ns Ar qualifier +.Pq "TS event" +Count events replayed at the store port. +Qualifier +.Ar qualifier +can take on one value: +.Pp +.Bl -tag -width indent -compact +.It Li split-st +Count split stores. +.El +.Pp +The default value for +.Ar qualifier +is +.Dq Li split-st . +.It Li p4-tc-deliver-mode Op Li ,mask= Ns Ar qualifier +.Pq "TI event" +Count the duration in cycles of operating modes of the trace cache and +decode engine. +The desired operating mode is selected by +.Ar qualifier , +which is a list of the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li DD +Both logical processors are in deliver mode. +.It Li DB +Logical processor 0 is in deliver mode while logical processor 1 is in +build mode. +.It Li DI +Logical processor 0 is in deliver mode while logical processor 1 is +halted, or in machine clear, or transitioning to a long microcode +flow. +.It Li BD +Logical processor 0 is in build mode while logical processor 1 is in +deliver mode. +.It Li BB +Both logical processors are in build mode. +.It Li BI +Logical processor 0 is in build mode while logical processor 1 is +halted, or in machine clear or transitioning to a long microcode +flow. +.It Li ID +Logical processor 0 is halted, or in machine clear or transitioning to +a long microcode flow while logical processor 1 is in deliver mode. +.It Li IB +Logical processor 0 is halted, or in machine clear or transitioning to +a long microcode flow while logical processor 1 is in build mode. 
+.El +.Pp +If there is only one logical processor in the processor package then +the qualifier for logical processor 1 is ignored. +If no qualifier is specified, the default qualifier is +.Dq Li DD+DB+DI+BD+BB+BI+ID+IB . +.It Li p4-tc-ms-xfer Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count the number of times uop delivery changed from the trace cache to +MS ROM. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li cisc +Count TC to MS transfers. +.El +.It Li p4-uop-queue-writes Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count the number of valid uops written to the uop queue. +Qualifier +.Ar flags +is a list of the following strings, separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li from-tc-build +Count uops being written from the trace cache in build mode. +.It Li from-tc-deliver +Count uops being written from the trace cache in deliver mode. +.It Li from-rom +Count uops being written from microcode ROM. +.El +.Pp +The default qualifier counts all the above kinds of uops. +.It Li p4-uop-type Op Li ,mask= Ns Ar flags +.Pq "TS event" +This event is used in conjunction with the front-end at-retirement +mechanism to tag load and store uops. +Qualifier +.Ar flags +comprises the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li tagloads +Mark uops that are load operations. +.It Li tagstores +Mark uops that are store operations. +.El +.Pp +The default qualifier counts both kinds of uops. +.It Li p4-uops-retired Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count uops retired during a clock cycle. +Qualifier +.Ar flags +comprises the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li nbogus +Count marked uops that are not bogus. +.It Li bogus +Count marked uops that are bogus. +.El +.Pp +The default qualifier counts both kinds of uops. 
+.It Li p4-wc-buffer Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count write-combining buffer operations. +Qualifier +.Ar flags +contains the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li wcb-evicts +WC buffer evictions due to any cause. +.It Li wcb-full-evict +WC buffer evictions due to no WC buffer being available. +.El +.Pp +The default qualifier counts both kinds of evictions. +.It Li p4-x87-assist Op Li ,mask= Ns Ar flags +.Pq "TS event" +Count the retirement of x87 instructions that required special +handling. +Qualifier +.Ar flags +contains the following strings separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li fpsu +Count instructions that saw an FP stack underflow. +.It Li fpso +Count instructions that saw an FP stack overflow. +.It Li poao +Count instructions that saw an x87 output overflow. +.It Li poau +Count instructions that saw an x87 output underflow. +.It Li prea +Count instructions that needed an x87 input assist. +.El +.Pp +The default qualifier counts all the above types of instruction +retirements. +.It Li p4-x87-fp-uop Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count x87 floating-point uops. +Qualifier +.Ar flags +can take the following value (which is also the default): +.Pp +.Bl -tag -width indent -compact +.It Li all +Count all x87 floating-point uops. +.El +.Pp +If an instruction contains more than one x87 floating-point uops, then +all x87 floating-point uops will be counted. +This event does not count x87 floating-point data movement operations. +.It Li p4-x87-simd-moves-uop Op Li ,mask= Ns Ar flags +.Pq "TI event" +Count each x87 FPU, MMX, SSE, or SSE2 uops that load data or store +data or perform register-to-register moves. +This event does not count integer move uops. +Qualifier +.Ar flags +may contain the following keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li allp0 +Count all x87 and SIMD store and move uops. 
+.It Li allp2 +Count all x87 and SIMD load uops. +.El +.Pp +The default is to count all uops. +.Pq Errata +This event may be affected by processor errata N43. +.El +.Ss "Cascading P4 PMCs" +PMC cascading support is currently poorly implemented. +While individual event counters may be allocated with a +.Dq Li cascade +qualifier, the current API does not offer the ability +to name and allocate all the resources needed for a +cascaded event counter pair in a single operation. +.Ss "Precise Event Based Sampling" +Support for precise event based sampling is currently +unimplemented. +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used. +.Bl -column "branch-mispredicts" "Description" +.It Em Alias Ta Em Event +.It Li branches Ta Li p4-branch-retired,mask=mmtp+mmtm +.It Li branch-mispredicts Ta Li p4-mispred-branch-retired +.It Li dc-misses Ta (unsupported) +.It Li ic-misses Ta (unsupported) +.It Li instructions Ta Li p4-instr-retired,mask=nbogusntag+nbogustag +.It Li interrupts Ta Li (unsupported) +.It Li unhalted-cycles Ta Li p4-global-power-events +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.p5.3 b/lib/libpmc/pmc.p5.3 new file mode 100644 index 0000000..36ab917 --- /dev/null +++ b/lib/libpmc/pmc.p5.3 @@ -0,0 +1,460 @@ +.\" Copyright (c) 2003-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. 
Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd October 4, 2008 +.Dt PMC.P5 3 +.Os +.Sh NAME +.Nm pmc.p5 +.Nd measurement events for +.Tn Intel +.Tn Pentium +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +Intel Pentium PMCs are present in Intel +.Tn Pentium +and +.Tn "Pentium MMX" +processors. +These PMCs are documented in the +.Rs +.%B "Intel 64 and IA-32 Intel(R) Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number 253669-024US" +.%D "August 2007" +.%Q "Intel Corporation" +.Re +.Ss PMC Features +These CPUs contain two PMCs, each 40 bits wide. 
+These PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta \&No +.It PMC_CAP_INTERRUPT Ta \&No +.It PMC_CAP_INVERT Ta \&No +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta \&No +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for Intel Pentium PMCs can have the following common +qualifiers: +.Bl -tag -width indent +.It Li duration +Count duration (in clocks) of events. +The default is to count events. +.It Li os +Measure events at privilege levels 0, 1 and 2. +.It Li overflow +Assert the external processor pin associated with a counter on counter +overflow. +.It Li usr +Measure events at privilege level 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. +.Pp +Some events may only be used on specific counters and some events +are defined only on processors supporting the MMX instruction set. +Note that these PMCs do not have the ability to interrupt the CPU. +.Ss Intel Pentium Event Specifiers +The event specifiers supported by Intel Pentium PMCs are: +.Bl -tag -width indent +.It Li p5-any-segment-register-loaded +.Pq Event 0FH +The number of writes to any segment register, including the LDTR, +GDTR, TR and IDTR. +Far control transfers and task switches that involve privilege +level changes will count this event twice. +.It Li p5-bank-conflicts +.Pq Event 0AH +The number of actual bank conflicts. +.It Li p5-branches +.Pq Event 12H +The number of taken and not taken branches including branches, jumps, calls, +software interrupts and interrupt returns. +.It Li p5-breakpoint-match-on-dr0-register +.Pq Event 23H +The number of matches on the DR0 breakpoint register. 
+.It Li p5-breakpoint-match-on-dr1-register +.Pq Event 24H +The number of matches on the DR1 breakpoint register. +.It Li p5-breakpoint-match-on-dr2-register +.Pq Event 25H +The number of matches on the DR2 breakpoint register. +.It Li p5-breakpoint-match-on-dr3-register +.Pq Event 26H +The number of matches on the DR3 breakpoint register. +.It Li p5-btb-false-entries +.Pq Event 3AH , Tn Pentium MMX +The number of false entries in the BTB. +This event is only allocated on counter 0. +.It Li p5-btb-hits +.Pq Event 13H +The number of branches executed that hit in the branch table buffer. +.It Li p5-btb-miss-prediction-on-not-taken-branch +.Pq Event 3AH , Tn Pentium MMX +The number of times the BTB predicted a not-taken branch as taken. +This event is only allocated on counter 1. +.It Li p5-bus-cycle-duration +.Pq Event 18H +The number of cycles while a bus cycle was in progress. +.It Li p5-bus-ownership-latency +.Pq Event 2AH , Tn Pentium MMX +The time from bus ownership being requested to ownership being granted. +This event is only allocated on counter 0. +.It Li p5-bus-ownership-transfers +.Pq Event 2AH , Tn Pentium MMX +The number of bus ownership transfers. +This event is only allocated on counter 1. +.It Li p5-bus-utilization-due-to-processor-activity +.Pq Event 2EH , Tn Pentium MMX +The number of clocks the bus is busy due to the processor's own +activity. +This event is only allocated on counter 0. +.It Li p5-cache-line-sharing +.Pq Event 2CH , Tn Pentium MMX +The number of shared data lines in L1 cache. +This event is only allocated on counter 1. +.It Li p5-cache-m-state-line-sharing +.Pq Event 2CH , Tn Pentium MMX +The number of hits to an M- state line due to a memory access by +another processor. +This event is only allocated on counter 0. +.It Li p5-code-cache-miss +.Pq Event 0EH +The number of instruction reads that miss the internal code cache. +Both cacheable and un-cacheable misses are counted. 
+.It Li p5-code-read +.Pq Event 0CH +The number of instruction reads to both cacheable and un-cacheable regions. +.It Li p5-code-tlb-miss +.Pq Event 0DH +The number of instruction reads that miss the instruction TLB. +Both cacheable and un-cacheable reads are counted. +.It Li p5-d1-starvation-and-fifo-is-empty +.Pq Event 33H , Tn Pentium MMX +The number of times the D1 stage cannot issue any instructions because +the FIFO was empty. +This event is only allocated on counter 0. +.It Li p5-d1-starvation-and-only-one-instruction-in-fifo +.Pq Event 33H , Tn Pentium MMX +The number of times the D1 stage could issue only one instruction +because the FIFO had one instruction ready. +This event is only allocated on counter 1. +.It Li p5-data-cache-lines-written-back +.Pq Event 06H +The number of data cache lines that are written back, including +those caused by internal and external snoops. +.It Li p5-data-cache-tlb-miss-stall-duration +.Pq Event 30H , Tn Pentium MMX +The number of clocks the pipeline is stalled due to a data cache +TLB miss. +This event is only allocated on counter 1. +.It Li p5-data-read +.Pq Event 00H +The number of memory data reads, counting internal data cache hits and +misses. +I/O and data memory accesses due to TLB miss processing are +not included. +Split cycle reads are counted individually. +.It Li p5-data-read-miss +.Pq Event 03H +The number of memory read accesses that miss the data cache, counting +both cacheable and un-cacheable accesses. +Data accesses that are part of TLB miss processing are not included. +I/O accesses are not included. +.It Li p5-data-read-miss-or-write-miss +.Pq Event 29H +The number of data reads and writes that miss the internal data cache, +counting un-cacheable accesses. +Data accesses due to TLB miss processing are not counted. +.It Li p5-data-read-or-write +.Pq Event 28H +The number of data reads and writes including internal data cache hits +and misses. +Data reads due to TLB miss processing are not counted. 
+.It Li p5-data-tlb-miss +.Pq Event 02H +The number of misses to the data cache translation look aside buffer. +.It Li p5-data-write +.Pq Event 01H +The number of memory data writes, counting internal data cache hits +and misses. +I/O is not included and split cycle writes are counted individually. +.It Li p5-data-write-miss +.Pq Event 04H +The number of memory write accesses that miss the data cache, counting +both cacheable and un-cacheable accesses. +I/O accesses are not counted. +.It Li p5-emms-instructions-executed +.Pq Event 2DH , Tn Pentium MMX +The number of EMMS instructions executed. +This event is only allocated on counter 0. +.It Li p5-external-data-cache-snoop-hits +.Pq Event 08H +The number of external snoops to the data cache that hit a valid line, +or the data line fill buffer, or one of the write back buffers. +.It Li p5-external-snoops +.Pq Event 07H +The number of external snoop requests accepted, including snoops that +hit in the code cache, the data cache and that hit in neither. +.It Li p5-floating-point-stalls-duration +.Pq Event 32H , Tn Pentium MMX +The number of cycles the pipeline is stalled due to a floating point +freeze. +This event is only allocated on counter 0. +.It Li p5-flops +.Pq Event 22H +The number of floating point adds, subtracts, multiplies, divides and +square roots. +Transcendental instructions trigger this event multiple times. +Instructions generating divide-by-zero, negative square root, special +operand and stack exceptions are not counted. +Integer multiply instructions that use the x87 FPU are counted. +.It Li p5-full-write-buffer-stall-duration-while-executing-mmx-instructions +.Pq Event 3BH , Tn Pentium MMX +The number of clocks the pipeline has stalled due to full write +buffers when executing MMX instructions. +This event is only allocated on counter 0. +.It Li p5-hardware-interrupts +.Pq Event 27H +The number of taken INTR and NMI interrupts. 
+.It Li p5-instructions-executed +.Pq Event 16H +The number of instructions executed. +Repeat prefixed instructions are counted only once. +The HLT instruction is counted only once, irrespective of the number +of cycles spent in the halted state. +All hardware and software exceptions are counted as instructions, and +fault handler invocations are also counted as instructions. +.It Li p5-instructions-executed-v-pipe +.Pq Event 17H +The number of instructions that executed in the V pipe. +.It Li p5-io-read-or-write-cycle +.Pq Event 1DH +The number of bus cycles directed to I/O space. +.It Li p5-locked-bus-cycle +.Pq Event 1CH +The number of locked bus cycles that occur on account of the lock +prefixes, LOCK instructions, page table updates and descriptor table +updates. +.It Li p5-memory-accesses-in-both-pipes +.Pq Event 09H +The number of data memory reads or writes that are paired in both pipes. +.It Li p5-misaligned-data-memory-or-io-references +.Pq Event 0BH +The number of memory or I/O reads or writes that are not aligned on +natural boundaries. +2- and 4-byte accesses are counted as misaligned if they cross a 4 +byte boundary. +.It Li p5-misaligned-data-memory-reference-on-mmx-instructions +.Pq Event 36H , Tn Pentium MMX +The number of misaligned data memory references when executing MMX +instructions. +This event is only allocated on counter 0. +.It Li p5-mispredicted-or-unpredicted-returns +.Pq Event 37H , Tn Pentium MMX +The number of returns predicted incorrectly or not at all, only +counting RET instructions. +This event is only allocated on counter 0. +.It Li p5-mmx-instruction-data-read-misses +.Pq Event 31H , Tn Pentium MMX +The number of MMX instruction data read misses. +This event is only allocated on counter 1. +.It Li p5-mmx-instruction-data-reads +.Pq Event 31H , Tn Pentium MMX +The number of MMX instruction data reads. +This event is only allocated on counter 0. 
+.It Li p5-mmx-instruction-data-write-misses +.Pq Event 34H , Tn Pentium MMX +The number of data write misses caused by MMX instructions. +This event is only allocated on counter 1. +.It Li p5-mmx-instruction-data-writes +.Pq Event 34H , Tn Pentium MMX +The number of data writes caused by MMX instructions. +This event is only allocated on counter 0. +.It Li p5-mmx-instructions-executed-u-pipe +.Pq Event 2BH , Tn Pentium MMX +The number of MMX instructions executed in the U pipe. +This event is only allocated on counter 0. +.It Li p5-mmx-instructions-executed-v-pipe +.Pq Event 2BH , Tn Pentium MMX +The number of MMX instructions executed in the V pipe. +This event is only allocated on counter 1. +.It Li p5-mmx-multiply-unit-interlock +.Pq Event 38H , Tn Pentium MMX +The number of clocks the pipeline is stalled because the destination +of a prior MMX multiply is not ready. +This event is only allocated on counter 0. +.It Li p5-movd-movq-store-stall-due-to-previous-mmx-operation +.Pq Event 38H , Tn Pentium MMX +The number of clocks a MOVD/MOVQ instruction stalled in the D2 stage +of the pipeline due to a previous MMX instruction. +This event is only allocated on counter 1. +.It Li p5-noncacheable-memory-reads +.Pq Event 1EH +The number of bus cycles for non-cacheable instruction or data reads, +including cycles caused by TLB misses. +.It Li p5-number-of-cycles-not-in-halt-state +.Pq Event 30H , Tn Pentium MMX +The number of cycles the processor is not idle due to the HLT +instruction. +This event is only allocated on counter 0. +.It Li p5-pipeline-agi-stalls +.Pq Event 1FH +The number of address generation interlock stalls. +An AGI that occurs in both the U and V pipelines in the same clock +signals the event twice. +.It Li p5-pipeline-flushes +.Pq Event 15H +The number of pipeline flushes that occur. +Pipeline flushes are caused by branch mispredicts, exceptions, +interrupts, some segment register loads, and BTB misses. 
+Prefetch queue flushes due to serializing instructions are not +counted. +.It Li p5-pipeline-flushes-due-to-wrong-branch-predictions +.Pq Event 35H , Tn Pentium MMX +The number of pipeline flushes due to wrong branch predictions +resolved in either the E- or WB- stage of the pipeline. +This event is only allocated on counter 0. +.It Li p5-pipeline-flushes-due-to-wrong-branch-predictions-resolved-in-wb-stage +.Pq Event 35H , Tn Pentium MMX +The number of pipeline flushes due to wrong branch predictions +resolved in the WB- stage of the pipeline. +This event is only allocated on counter 1. +.It Li p5-pipeline-stall-for-mmx-instruction-data-memory-reads +.Pq Event 36H , Tn Pentium MMX +The number of clocks during pipeline stalls caused by waiting MMX data +memory reads. +This event is only allocated on counter 1. +.It Li p5-predicted-returns +.Pq Event 37H , Tn Pentium MMX +The number of predicted returns, whether correct or incorrect. +This counter only counts RET instructions. +This event is only allocated on counter 1. +.It Li p5-returns +.Pq Event 39H , Tn Pentium MMX +The number of RET instructions executed. +This event is only allocated on counter 0. +.It Li p5-saturating-mmx-instructions-executed +.Pq Event 2FH , Tn Pentium MMX +The number of saturating MMX instructions executed. +This event is only allocated on counter 0. +.It Li p5-saturations-performed +.Pq Event 2FH , Tn Pentium MMX +The number of saturating MMX instructions executed when at least one +of its results was actually saturated. +This event is only allocated on counter 1. +.It Li p5-stall-on-mmx-instruction-write-to-e-o-m-state-line +.Pq Event 3BH , Tn Pentium MMX +The number of clocks during stalls on MMX instructions writing to +E- or M- state cache lines. +This event is only allocated on counter 1. +.It Li p5-stall-on-write-to-an-e-or-m-state-line +.Pq Event 1BH +The number of stalls on a write to an exclusive or modified data cache +line. 
+.It Li p5-taken-branch-or-btb-hit +.Pq Event 14H +The number of events that may cause a hit in the BTB, namely either +taken branches or BTB hits. +.It Li p5-taken-branches +.Pq Event 32H , Tn Pentium MMX +The number of taken branches. +This event is only allocated on counter 1. +.It Li p5-transitions-between-mmx-and-fp-instructions +.Pq Event 2DH , Tn Pentium MMX +The number of transitions between MMX and floating-point instructions +and vice-versa. +This event is only allocated on counter 1. +.It Li p5-waiting-for-data-memory-read-stall-duration +.Pq Event 1AH +The number of clocks the pipeline was stalled waiting for data +memory reads. +Data TLB miss processing is included in this count. +.It Li p5-write-buffer-full-stall-duration +.Pq Event 19H +The number of clocks while the pipeline was stalled due to write +buffers being full. +.It Li p5-write-hit-to-m-or-e-state-lines +.Pq Event 05H +The number of writes that hit exclusive or modified lines in the data +cache. +.It Li p5-writes-to-noncacheable-memory +.Pq Event 2EH , Tn Pentium MMX +The number of writes to non-cacheable memory, including write cycles +caused by TLB misses and I/O writes. +This event is only allocated on counter 1. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used. 
+.Bl -column "branch-mispredicts" "Description" +.It Em Alias Ta Em Event +.It Li branches Ta Li p5-taken-branches +.It Li branch-mispredicts Ta Li (unsupported) +.It Li dc-misses Ta Li p5-data-read-miss-or-write-miss +.It Li ic-misses Ta Li p5-code-cache-miss +.It Li instructions Ta Li p5-instructions-executed +.It Li interrupts Ta Li p5-hardware-interrupts +.It Li unhalted-cycles Ta Li p5-number-of-cycles-not-in-halt-state +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p6 3 , +.Xr pmc.tsc 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.p6.3 b/lib/libpmc/pmc.p6.3 new file mode 100644 index 0000000..d8cde64 --- /dev/null +++ b/lib/libpmc/pmc.p6.3 @@ -0,0 +1,1026 @@ +.\" Copyright (c) 2003-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd October 4, 2008 +.Dt PMC.P6 3 +.Os +.Sh NAME +.Nm pmc.p6 +.Nd measurement events for +.Tn Intel +Pentium Pro, P-II, P-III family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +Intel P6 PMCs are present in Intel +.Tn "Pentium Pro" , +.Tn "Pentium II" , +.Tn Celeron , +.Tn "Pentium III" +and +.Tn "Pentium M" +processors. +.Pp +They are documented in +.Rs +.%B "IA-32 Intel(R) Architecture Software Developer's Manual" +.%T "Volume 3: System Programming Guide" +.%N "Order Number 245472-012" +.%D 2003 +.%Q "Intel Corporation" +.Re +.Pp +Some of these events are affected by processor errata described in +.Rs +.%B "Intel(R) Pentium(R) III Processor Specification Update" +.%N "Document Number: 244453-054" +.%D "April 2005" +.%Q "Intel Corporation" +.Re +.Ss PMC Features +These CPUs have two counters, each 40 bits wide. +Some events may only be used on specific counters and some events are +defined only on specific processor models. 
+These PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for Intel P6 PMCs can have the following common +qualifiers: +.Bl -tag -width indent +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.It Li os +Configure the PMC to count events happening at processor privilege +level 0. +.It Li umask= Ns Ar value +This qualifier is used to further qualify the event selected (see +below). +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both. +.Pp +The event specifiers supported by Intel P6 PMCs are: +.Bl -tag -width indent +.It Li p6-baclears +.Pq Event E6H +Count the number of times a static branch prediction was made by the +branch decoder because the BTB did not have a prediction. 
+.It Li p6-br-bac-missp-exec +.Pq Event 8AH , Tn "Pentium M" +Count the number of branch instructions executed that were +mispredicted at the Front End (BAC). +.It Li p6-br-bogus +.Pq Event E4H +Count the number of bogus branches. +.It Li p6-br-call-exec +.Pq Event 92H , Tn "Pentium M" +Count the number of call instructions executed. +.It Li p6-br-call-missp-exec +.Pq Event 93H , Tn "Pentium M" +Count the number of call instructions executed that were mispredicted. +.It Li p6-br-cnd-exec +.Pq Event 8BH , Tn "Pentium M" +Count the number of conditional branch instructions executed. +.It Li p6-br-cnd-missp-exec +.Pq Event 8CH , Tn "Pentium M" +Count the number of conditional branch instructions executed that were +mispredicted. +.It Li p6-br-ind-call-exec +.Pq Event 94H , Tn "Pentium M" +Count the number of indirect call instructions executed. +.It Li p6-br-ind-exec +.Pq Event 8DH , Tn "Pentium M" +Count the number of indirect branch instructions executed. +.It Li p6-br-ind-missp-exec +.Pq Event 8EH , Tn "Pentium M" +Count the number of indirect branch instructions executed that were +mispredicted. +.It Li p6-br-inst-decoded +.Pq Event E0H +Count the number of branch instructions decoded. +.It Li p6-br-inst-exec +.Pq Event 88H , Tn "Pentium M" +Count the number of branch instructions executed but not necessarily retired. +.It Li p6-br-inst-retired +.Pq Event C4H +Count the number of branch instructions retired. +.It Li p6-br-miss-pred-retired +.Pq Event C5H +Count the number of mispredicted branch instructions retired. +.It Li p6-br-miss-pred-taken-ret +.Pq Event C9H +Count the number of taken mispredicted branches retired. +.It Li p6-br-missp-exec +.Pq Event 89H , Tn "Pentium M" +Count the number of branch instructions executed that were +mispredicted at execution. +.It Li p6-br-ret-bac-missp-exec +.Pq Event 91H , Tn "Pentium M" +Count the number of return instructions executed that were +mispredicted at the Front End (BAC). 
+.It Li p6-br-ret-exec +.Pq Event 8FH , Tn "Pentium M" +Count the number of return instructions executed. +.It Li p6-br-ret-missp-exec +.Pq Event 90H , Tn "Pentium M" +Count the number of return instructions executed that were +mispredicted at execution. +.It Li p6-br-taken-retired +.Pq Event C9H +Count the number of taken branches retired. +.It Li p6-btb-misses +.Pq Event E2H +Count the number of branches for which the BTB did not produce a +prediction. +.It Li p6-bus-bnr-drv +.Pq Event 61H +Count the number of bus clock cycles during which this processor is +driving the BNR# pin. +.It Li p6-bus-data-rcv +.Pq Event 64H +Count the number of bus clock cycles during which this processor is +receiving data. +.It Li p6-bus-drdy-clocks Op Li ,umask= Ns Ar qualifier +.Pq Event 62H +Count the number of clocks during which DRDY# is asserted. +An additional qualifier may be specified, and comprises one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-hit-drv +.Pq Event 7AH +Count the number of bus clock cycles during which this processor is +driving the HIT# pin. +.It Li p6-bus-hitm-drv +.Pq Event 7BH +Count the number of bus clock cycles during which this processor is +driving the HITM# pin. +.It Li p6-bus-lock-clocks Op Li ,umask= Ns Ar qualifier +.Pq Event 63H +Count the number of clocks during which LOCK# is asserted on the +external system bus. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. 
+.It Li p6-bus-req-outstanding +.Pq Event 60H +Count the number of bus requests outstanding in any given cycle. +.It Li p6-bus-snoop-stall +.Pq Event 7EH +Count the number of clock cycles during which the bus is snoop stalled. +.It Li p6-bus-tran-any Op Li ,umask= Ns Ar qualifier +.Pq Event 70H +Count the number of completed bus transactions of any kind. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-tran-brd Op Li ,umask= Ns Ar qualifier +.Pq Event 65H +Count the number of burst read transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-tran-burst Op Li ,umask= Ns Ar qualifier +.Pq Event 6EH +Count the number of completed burst transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-tran-def Op Li ,umask= Ns Ar qualifier +.Pq Event 6DH +Count the number of completed deferred transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. 
+.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-tran-ifetch Op Li ,umask= Ns Ar qualifier +.Pq Event 68H +Count the number of completed instruction fetch transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-tran-inval Op Li ,umask= Ns Ar qualifier +.Pq Event 69H +Count the number of completed invalidate transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-tran-mem Op Li ,umask= Ns Ar qualifier +.Pq Event 6FH +Count the number of completed memory transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-tran-pwr Op Li ,umask= Ns Ar qualifier +.Pq Event 6AH +Count the number of completed partial write transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. 
+.It Li p6-bus-tran-rfo Op Li ,umask= Ns Ar qualifier +.Pq Event 66H +Count the number of completed read-for-ownership transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-trans-io Op Li ,umask= Ns Ar qualifier +.Pq Event 6CH +Count the number of completed I/O transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-trans-p Op Li ,umask= Ns Ar qualifier +.Pq Event 6BH +Count the number of completed partial transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-bus-trans-wb Op Li ,umask= Ns Ar qualifier +.Pq Event 67H +Count the number of completed write-back transactions. +An additional qualifier may be specified and comprises one of the following +keywords: +.Pp +.Bl -tag -width indent -compact +.It Li any +Count transactions generated by any agent on the bus. +.It Li self +Count transactions generated by this processor. +.El +.Pp +The default is to count operations generated by this processor. +.It Li p6-cpu-clk-unhalted +.Pq Event 79H +Count the number of cycles during which the processor was not halted. 
+.Pp +.Pq Tn "Pentium M" +Count the number of cycles during which the processor was not halted +and not in a thermal trip. +.It Li p6-cycles-div-busy +.Pq Event 14H +Count the number of cycles during which the divider is busy and cannot +accept new divides. +This event is only allocated on counter 0. +.It Li p6-cycles-int-pending-and-masked +.Pq Event C7H +Count the number of processor cycles for which interrupts were +disabled and interrupts were pending. +.It Li p6-cycles-int-masked +.Pq Event C6H +Count the number of processor cycles for which interrupts were +disabled. +.It Li p6-data-mem-refs +.Pq Event 43H +Count all loads and all stores using any memory type, including +internal retries. +Each part of a split store is counted separately. +.It Li p6-dcu-lines-in +.Pq Event 45H +Count the total lines allocated in the data cache unit. +.It Li p6-dcu-m-lines-in +.Pq Event 46H +Count the number of M state lines allocated in the data cache unit. +.It Li p6-dcu-m-lines-out +.Pq Event 47H +Count the number of M state lines evicted from the data cache unit. +.It Li p6-dcu-miss-outstanding +.Pq Event 48H +Count the weighted number of cycles while a data cache unit miss is +outstanding, incremented by the number of outstanding cache misses at +any time. +.It Li p6-div +.Pq Event 13H +Count the number of integer and floating-point divides including +speculative divides. +This event is only allocated on counter 1. +.It Li p6-emon-esp-uops +.Pq Event D7H , Tn "Pentium M" +Count the total number of micro-ops. +.It Li p6-emon-est-trans Op Li ,umask= Ns Ar qualifier +.Pq Event 58H , Tn "Pentium M" +Count the number of +.Tn "Enhanced Intel SpeedStep" +transitions. +An additional qualifier may be specified, and can be one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li all +Count all transitions. +.It Li freq +Count only frequency transitions. +.El +.Pp +The default is to count all transitions. 
+.It Li p6-emon-fused-uops-ret Op Li ,umask= Ns Ar qualifier +.Pq Event DAH , Tn "Pentium M" +Count the number of retired fused micro-ops. +An additional qualifier may be specified, and may be one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li all +Count all fused micro-ops. +.It Li loadop +Count only load and op micro-ops. +.It Li stdsta +Count only STD/STA micro-ops. +.El +.Pp +The default is to count all fused micro-ops. +.It Li p6-emon-kni-comp-inst-ret +.Pq Event D9H , Tn "Pentium III" +Count the number of SSE computational instructions retired. +An additional qualifier may be specified, and comprises one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li packed-and-scalar +Count packed and scalar operations. +.It Li scalar +Count scalar operations only. +.El +.Pp +The default is to count packed and scalar operations. +.It Li p6-emon-kni-inst-retired Op Li ,umask= Ns Ar qualifier +.Pq Event D8H , Tn "Pentium III" +Count the number of SSE instructions retired. +An additional qualifier may be specified, and comprises one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li packed-and-scalar +Count packed and scalar operations. +.It Li scalar +Count scalar operations only. +.El +.Pp +The default is to count packed and scalar operations. +.It Li p6-emon-kni-pref-dispatched Op Li ,umask= Ns Ar qualifier +.Pq Event 07H , Tn "Pentium III" +Count the number of SSE prefetch or weakly ordered instructions +dispatched (including speculative prefetches). +An additional qualifier may be specified, and comprises one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li nta +Count non-temporal prefetches. +.It Li t1 +Count prefetches to L1. +.It Li t2 +Count prefetches to L2. +.It Li wos +Count weakly ordered stores. +.El +.Pp +The default is to count non-temporal prefetches. 
+.It Li p6-emon-kni-pref-miss Op Li ,umask= Ns Ar qualifier +.Pq Event 4BH , Tn "Pentium III" +Count the number of prefetch or weakly ordered instructions that miss +all caches. +An additional qualifier may be specified, and comprises one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li nta +Count non-temporal prefetches. +.It Li t1 +Count prefetches to L1. +.It Li t2 +Count prefetches to L2. +.It Li wos +Count weakly ordered stores. +.El +.Pp +The default is to count non-temporal prefetches. +.It Li p6-emon-pref-rqsts-dn +.Pq Event F8H , Tn "Pentium M" +Count the number of downward prefetches issued. +.It Li p6-emon-pref-rqsts-up +.Pq Event F0H , Tn "Pentium M" +Count the number of upward prefetches issued. +.It Li p6-emon-simd-instr-retired +.Pq Event CEH , Tn "Pentium M" +Count the number of retired +.Tn MMX +instructions. +.It Li p6-emon-sse-sse2-comp-inst-retired Op Li ,umask= Ns Ar qualifier +.Pq Event D9H , Tn "Pentium M" +Count the number of computational SSE instructions retired. +An additional qualifier may be specified and can be one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li sse-packed-single +Count SSE packed-single instructions. +.It Li sse-scalar-single +Count SSE scalar-single instructions. +.It Li sse2-packed-double +Count SSE2 packed-double instructions. +.It Li sse2-scalar-double +Count SSE2 scalar-double instructions. +.El +.Pp +The default is to count SSE packed-single instructions. +.It Li p6-emon-sse-sse2-inst-retired Op Li ,umask= Ns Ar qualifier +.Pq Event D8H , Tn "Pentium M" +Count the number of SSE instructions retired. +An additional qualifier can be specified, and can be one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li sse-packed-single +Count SSE packed-single instructions. +.It Li sse-packed-single-scalar-single +Count SSE packed-single and scalar-single instructions. +.It Li sse2-packed-double +Count SSE2 packed-double instructions. 
+.It Li sse2-scalar-double +Count SSE2 scalar-double instructions. +.El +.Pp +The default is to count SSE packed-single instructions. +.It Li p6-emon-synch-uops +.Pq Event D3H , Tn "Pentium M" +Count the number of sync micro-ops. +.It Li p6-emon-thermal-trip +.Pq Event 59H , Tn "Pentium M" +Count the duration or occurrences of thermal trips. +Use the +.Dq Li edge +qualifier to count occurrences of thermal trips. +.It Li p6-emon-unfusion +.Pq Event DBH , Tn "Pentium M" +Count the number of unfusion events in the reorder buffer. +.It Li p6-flops +.Pq Event C1H +Count the number of computational floating point operations retired. +This event is only allocated on counter 0. +.It Li p6-fp-assist +.Pq Event 11H +Count the number of floating point exceptions handled by microcode. +This event is only allocated on counter 1. +.It Li p6-fp-comps-ops-exe +.Pq Event 10H +Count the number of computation floating point operations executed. +This event is only allocated on counter 0. +.It Li p6-fp-mmx-trans Op Li ,umask= Ns Ar qualifier +.Pq Event CCH , Tn "Pentium II" , Tn "Pentium III" +Count the number of transitions between MMX and floating-point +instructions. +An additional qualifier may be specified, and comprises one of the +following keywords: +.Pp +.Bl -tag -width indent -compact +.It Li mmxtofp +Count transitions from MMX instructions to floating-point instructions. +.It Li fptommx +Count transitions from floating-point instructions to MMX instructions. +.El +.Pp +The default is to count MMX to floating-point transitions. +.It Li p6-hw-int-rx +.Pq Event C8H +Count the number of hardware interrupts received. +.It Li p6-ifu-ifetch +.Pq Event 80H +Count the number of instruction fetches, both cacheable and non-cacheable. +.It Li p6-ifu-ifetch-miss +.Pq Event 81H +Count the number of instruction fetch misses (i.e., those that produce +memory accesses). +.It Li p6-ifu-mem-stall +.Pq Event 86H +Count the number of cycles instruction fetch is stalled for any reason. 
+.It Li p6-ild-stall +.Pq Event 87H +Count the number of cycles the instruction length decoder is stalled. +.It Li p6-inst-decoded +.Pq Event D0H +Count the number of instructions decoded. +.It Li p6-inst-retired +.Pq Event C0H +Count the number of instructions retired. +.It Li p6-itlb-miss +.Pq Event 85H +Count the number of instruction TLB misses. +.It Li p6-l2-ads +.Pq Event 21H +Count the number of L2 address strobes. +.It Li p6-l2-dbus-busy +.Pq Event 22H +Count the number of cycles during which the L2 cache data bus was busy. +.It Li p6-l2-dbus-busy-rd +.Pq Event 23H +Count the number of cycles during which the L2 cache data bus was busy +transferring read data from L2 to the processor. +.It Li p6-l2-ifetch Op Li ,umask= Ns Ar qualifier +.Pq Event 28H +Count the number of L2 instruction fetches. +An additional qualifier may be specified and comprises a list of the following +keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li e +Count operations affecting E (exclusive) state lines. +.It Li i +Count operations affecting I (invalid) state lines. +.It Li m +Count operations affecting M (modified) state lines. +.It Li s +Count operations affecting S (shared) state lines. +.El +.Pp +The default is to count operations affecting all (MESI) state lines. +.It Li p6-l2-ld Op Li ,umask= Ns Ar qualifier +.Pq Event 29H +Count the number of L2 data loads. +An additional qualifier may be specified and comprises a list of the following +keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li both +.Pq Tn "Pentium M" +Count both hardware-prefetched lines and non-hardware-prefetched lines. +.It Li e +Count operations affecting E (exclusive) state lines. +.It Li hw +.Pq Tn "Pentium M" +Count hardware-prefetched lines only. +.It Li i +Count operations affecting I (invalid) state lines. +.It Li m +Count operations affecting M (modified) state lines. 
+.It Li nonhw +.Pq Tn "Pentium M" +Exclude hardware-prefetched lines. +.It Li s +Count operations affecting S (shared) state lines. +.El +.Pp +The default on processors other than +.Tn "Pentium M" +processors is to count operations affecting all (MESI) state lines. +The default on +.Tn "Pentium M" +processors is to count both hardware-prefetched and +non-hardware-prefetch operations on all (MESI) state lines. +.Pq Errata +This event is affected by processor errata E53. +.It Li p6-l2-lines-in Op Li ,umask= Ns Ar qualifier +.Pq Event 24H +Count the number of L2 lines allocated. +An additional qualifier may be specified and comprises a list of the following +keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li both +.Pq Tn "Pentium M" +Count both hardware-prefetched lines and non-hardware-prefetched lines. +.It Li e +Count operations affecting E (exclusive) state lines. +.It Li hw +.Pq Tn "Pentium M" +Count hardware-prefetched lines only. +.It Li i +Count operations affecting I (invalid) state lines. +.It Li m +Count operations affecting M (modified) state lines. +.It Li nonhw +.Pq Tn "Pentium M" +Exclude hardware-prefetched lines. +.It Li s +Count operations affecting S (shared) state lines. +.El +.Pp +The default on processors other than +.Tn "Pentium M" +processors is to count operations affecting all (MESI) state lines. +The default on +.Tn "Pentium M" +processors is to count both hardware-prefetched and +non-hardware-prefetch operations on all (MESI) state lines. +.Pq Errata +This event is affected by processor errata E45. +.It Li p6-l2-lines-out Op Li ,umask= Ns Ar qualifier +.Pq Event 26H +Count the number of L2 lines evicted. +An additional qualifier may be specified and comprises a list of the following +keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li both +.Pq Tn "Pentium M" +Count both hardware-prefetched lines and non-hardware-prefetched lines. 
+.It Li e +Count operations affecting E (exclusive) state lines. +.It Li hw +.Pq Tn "Pentium M" +Count hardware-prefetched lines only. +.It Li i +Count operations affecting I (invalid) state lines. +.It Li m +Count operations affecting M (modified) state lines. +.It Li nonhw +.Pq Tn "Pentium M" only +Exclude hardware-prefetched lines. +.It Li s +Count operations affecting S (shared) state lines. +.El +.Pp +The default on processors other than +.Tn "Pentium M" +processors is to count operations affecting all (MESI) state lines. +The default on +.Tn "Pentium M" +processors is to count both hardware-prefetched and +non-hardware-prefetch operations on all (MESI) state lines. +.Pq Errata +This event is affected by processor errata E45. +.It Li p6-l2-m-lines-inm +.Pq Event 25H +Count the number of modified lines allocated in L2 cache. +.It Li p6-l2-m-lines-outm Op Li ,umask= Ns Ar qualifier +.Pq Event 27H +Count the number of L2 M-state lines evicted. +.Pp +.Pq Tn "Pentium M" +On these processors an additional qualifier may be specified and +comprises a list of the following keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li both +Count both hardware-prefetched lines and non-hardware-prefetched lines. +.It Li hw +Count hardware-prefetched lines only. +.It Li nonhw +Exclude hardware-prefetched lines. +.El +.Pp +The default is to count both hardware-prefetched and +non-hardware-prefetch operations. +.Pq Errata +This event is affected by processor errata E53. +.It Li p6-l2-rqsts Op Li ,umask= Ns Ar qualifier +.Pq Event 2EH +Count the total number of L2 requests. +An additional qualifier may be specified and comprises a list of the following +keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li e +Count operations affecting E (exclusive) state lines. +.It Li i +Count operations affecting I (invalid) state lines. +.It Li m +Count operations affecting M (modified) state lines. 
+.It Li s +Count operations affecting S (shared) state lines. +.El +.Pp +The default is to count operations affecting all (MESI) state lines. +.It Li p6-l2-st +.Pq Event 2AH +Count the number of L2 data stores. +An additional qualifier may be specified and comprises a list of the following +keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li e +Count operations affecting E (exclusive) state lines. +.It Li i +Count operations affecting I (invalid) state lines. +.It Li m +Count operations affecting M (modified) state lines. +.It Li s +Count operations affecting S (shared) state lines. +.El +.Pp +The default is to count operations affecting all (MESI) state lines. +.It Li p6-ld-blocks +.Pq Event 03H +Count the number of load operations delayed due to store buffer blocks. +.It Li p6-misalign-mem-ref +.Pq Event 05H +Count the number of misaligned data memory references (crossing a 64 +bit boundary). +.It Li p6-mmx-assist +.Pq Event CDH , Tn "Pentium II" , Tn "Pentium III" +Count the number of MMX assists executed. +.It Li p6-mmx-instr-exec +.Pq Event B0H +.Pq Tn Celeron , Tn "Pentium II" +Count the number of MMX instructions executed, except MOVQ and MOVD +stores from register to memory. +.It Li p6-mmx-instr-ret +.Pq Event CEH , Tn "Pentium II" +Count the number of MMX instructions retired. +.It Li p6-mmx-instr-type-exec Op Li ,umask= Ns Ar qualifier +.Pq Event B3H , Tn "Pentium II" , Tn "Pentium III" +Count the number of MMX instructions executed. +An additional qualifier may be specified and comprises a list of +the following keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li pack +Count MMX pack operation instructions. +.It Li packed-arithmetic +Count MMX packed arithmetic instructions. +.It Li packed-logical +Count MMX packed logical instructions. +.It Li packed-multiply +Count MMX packed multiply instructions. +.It Li packed-shift +Count MMX packed shift instructions. 
+.It Li unpack +Count MMX unpack operation instructions. +.El +.Pp +The default is to count all operations. +.It Li p6-mmx-sat-instr-exec +.Pq Event B1H , Tn "Pentium II" , Tn "Pentium III" +Count the number of MMX saturating instructions executed. +.It Li p6-mmx-uops-exec +.Pq Event B2H , Tn "Pentium II" , Tn "Pentium III" +Count the number of MMX micro-ops executed. +.It Li p6-mul +.Pq Event 12H +Count the number of integer and floating-point multiplies, including +speculative multiplies. +This event is only allocated on counter 1. +.It Li p6-partial-rat-stalls +.Pq Event D2H +Count the number of cycles or events for partial stalls. +.It Li p6-resource-stalls +.Pq Event A2H +Count the number of cycles there was a resource related stall of any kind. +.It Li p6-ret-seg-renames +.Pq Event D6H , Tn "Pentium II" , Tn "Pentium III" +Count the number of segment register rename events retired. +.It Li p6-sb-drains +.Pq Event 04H +Count the number of cycles the store buffer is draining. +.It Li p6-seg-reg-renames Op Li ,umask= Ns Ar qualifier +.Pq Event D5H , Tn "Pentium II" , Tn "Pentium III" +Count the number of segment register renames. +An additional qualifier may be specified, and comprises a list of the +following keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li ds +Count renames for segment register DS. +.It Li es +Count renames for segment register ES. +.It Li fs +Count renames for segment register FS. +.It Li gs +Count renames for segment register GS. +.El +.Pp +The default is to count operations affecting all segment registers. +.It Li p6-seg-rename-stalls +.Pq Event D4H , Tn "Pentium II" , Tn "Pentium III" +Count the number of segment register renaming stalls. +An additional qualifier may be specified, and comprises a list of the +following keywords separated by +.Ql + +characters: +.Pp +.Bl -tag -width indent -compact +.It Li ds +Count stalls for segment register DS. +.It Li es +Count stalls for segment register ES. 
+.It Li fs +Count stalls for segment register FS. +.It Li gs +Count stalls for segment register GS. +.El +.Pp +The default is to count operations affecting all the segment registers. +.It Li p6-segment-reg-loads +.Pq Event 06H +Count the number of segment register loads. +.It Li p6-uops-retired +.Pq Event C2H +Count the number of micro-ops retired. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used. +.Bl -column "branch-mispredicts" "Description" +.It Em Alias Ta Em Event +.It Li branches Ta Li p6-br-inst-retired +.It Li branch-mispredicts Ta Li p6-br-miss-pred-retired +.It Li dc-misses Ta Li p6-dcu-lines-in +.It Li ic-misses Ta Li p6-ifu-fetch-miss +.It Li instructions Ta Li p6-inst-retired +.It Li interrupts Ta Li p6-hw-int-rx +.It Li unhalted-cycles Ta Li p6-cpu-clk-unhalted +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.tsc 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.tsc.3 b/lib/libpmc/pmc.tsc.3 new file mode 100644 index 0000000..144ff35 --- /dev/null +++ b/lib/libpmc/pmc.tsc.3 @@ -0,0 +1,83 @@ +.\" Copyright (c) 2003-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd October 4, 2008 +.Dt PMC.TSC 3 +.Os +.Sh NAME +.Nm pmc.tsc +.Nd measurements using the i386 timestamp counter +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +In the i386 architecture, the timestamp counter is a monotonically +non-decreasing counter that counts processor cycles. +.Pp +This counter may be selected specifying an event specifier +.Dq Li tsc +to +.Xr pmc_allocate 3 . +The TSC is a read-only counter that may only be allocated in +system-wide counting mode. +The +.Dq Li tsc +event does not support further event qualifiers. +.Pp +Multiple processes are allowed to allocate the TSC. +Once allocated, the TSC may be read using the +.Fn pmc_read +function, or by using the +.Li RDTSC +instruction. +.Ss Event Name Aliases +The alias +.Dq Li cycles +maps to the TSC. 
+.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.ucf.3 b/lib/libpmc/pmc.ucf.3 new file mode 100644 index 0000000..5155eb6 --- /dev/null +++ b/lib/libpmc/pmc.ucf.3 @@ -0,0 +1,113 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. 
+.\" +.\" $FreeBSD$ +.\" +.Dd March 30, 2010 +.Dt PMC.UCF 3 +.Os +.Sh NAME +.Nm pmc.ucf +.Nd measurement events for +.Tn Intel +uncore fixed function performance counters +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +Each fixed-function PMC measures a specific hardware event. +The number of fixed-function PMCs implemented in a CPU can vary. +The number of fixed-function PMCs present can be determined at runtime +by using function +.Xr pmc_cpuinfo 3 . +.Pp +Intel uncore fixed-function PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Pp +.Ss PMC Capabilities +Fixed-function PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta \&No +.It PMC_CAP_INTERRUPT Ta \&No +.It PMC_CAP_INVERT Ta \&No +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta \&No +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta \&No +.It PMC_CAP_USER Ta \&No +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Class Name Prefix +These PMCs are named using a class name prefix of +.Dq Li ucf- . +.Ss Event Specifiers (Fixed Function PMCs) +The fixed function PMCs are selectable using the following +event names: +.Bl -tag -width indent +.It Li UCLOCK +.Pq Fixed Function Counter 0 +The fixed-function uncore counter increments at the rate of the U-clock. +The frequency of the uncore clock domain can be determined from the uncore +clock ratio which is available in the PCI configuration space register at +offset C0H under device number 0 and Function 0.
+.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.core2 3 , +.Xr pmc.iaf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7 3 , +.Xr pmc.corei7uc 3 , +.Xr pmc.westmere 3 , +.Xr pmc.westmereuc 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.westmere.3 b/lib/libpmc/pmc.westmere.3 new file mode 100644 index 0000000..bd0244e --- /dev/null +++ b/lib/libpmc/pmc.westmere.3 @@ -0,0 +1,1329 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2010 +.Dt PMC.WESTMERE 3 +.Os +.Sh NAME +.Nm pmc.westmere +.Nd measurement events for +.Tn Intel +.Tn Westmere +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Westmere" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +These CPUs may contain up to three classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_IAP" +.It Li PMC_CLASS_IAF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_IAP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . +.Pp +Intel Westmere PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Ss WESTMERE FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.iaf 3 .
+.Ss WESTMERE PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta Yes +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta Yes +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta Yes +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li rsp= Ns Ar value +Configure the Off-core Response bits. +.Bl -tag -width indent +.It Li DMND_DATA_RD +Counts the number of demand and DCU prefetch data reads of full +and partial cachelines as well as demand data page table entry +cacheline reads. Does not count L2 data read prefetches or +instruction fetches. +.It Li DMND_RFO +Counts the number of demand and DCU prefetch reads for ownership +(RFO) requests generated by a write to data cacheline. Does not +count L2 RFO. +.It Li DMND_IFETCH +Counts the number of demand and DCU prefetch instruction cacheline +reads. Does not count L2 code read prefetches. +.It Li WB +Counts the number of writeback (modified to exclusive) transactions. +.It Li PF_DATA_RD +Counts the number of data cacheline reads generated by L2 prefetchers. +.It Li PF_RFO +Counts the number of RFO requests generated by L2 prefetchers. +.It Li PF_IFETCH +Counts the number of code reads generated by L2 prefetchers. +.It Li OTHER +Counts one of the following transaction types, including L3 invalidate, +I/O, full or partial writes, WC or non-temporal stores, CLFLUSH, Fences, +lock, unlock, split lock. +.It Li UNCORE_HIT +L3 Hit: local or remote home requests that hit L3 cache in the uncore +with no coherency actions required (snooping).
+.It Li OTHER_CORE_HIT_SNP +L3 Hit: local or remote home requests that hit L3 cache in the uncore +and was serviced by another core with a cross core snoop where no modified +copies were found (clean). +.It Li OTHER_CORE_HITM +L3 Hit: local or remote home requests that hit L3 cache in the uncore +and was serviced by another core with a cross core snoop where modified +copies were found (HITM). +.It Li REMOTE_CACHE_FWD +L3 Miss: local homed requests that missed the L3 cache and was serviced +by forwarded data following a cross package snoop where no modified +copies were found (Remote home requests are not counted). +.It Li REMOTE_DRAM +L3 Miss: remote home requests that missed the L3 cache and were serviced +by remote DRAM. +.It Li LOCAL_DRAM +L3 Miss: local home requests that missed the L3 cache and were serviced +by local DRAM. +.It Li NON_DRAM +Non-DRAM requests that were serviced by IOH. +.El +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.It Li os +Configure the PMC to count events happening at processor privilege +level 0. +.It Li usr +Configure the PMC to count events occurring at privilege levels 1, 2 +or 3. +.El +.Pp +If neither of the +.Dq Li os +or +.Dq Li usr +qualifiers are specified, the default is to enable both.
+.Ss Event Specifiers (Programmable PMCs) +Westmere programmable PMCs support the following events: +.Bl -tag -width indent +.It Li LOAD_BLOCK.OVERLAP_STORE +.Pq Event 03H , Umask 02H +Loads that partially overlap an earlier store +.It Li SB_DRAIN.ANY +.Pq Event 04H , Umask 07H +All Store buffer stall cycles +.It Li MISALIGN_MEMORY.STORE +.Pq Event 05H , Umask 02H +All store referenced with misaligned address +.It Li STORE_BLOCKS.AT_RET +.Pq Event 06H , Umask 04H +Counts number of loads delayed with at-Retirement block code. The following +loads need to be executed at retirement and wait for all senior stores on +the same thread to be drained: load splitting across 4K boundary (page +split), load accessing uncacheable (UC or USWC) memory, load lock, and load +with page table in UC or USWC memory region. +.It Li STORE_BLOCKS.L1D_BLOCK +.Pq Event 06H , Umask 08H +Cacheable loads delayed with L1D block code +.It Li PARTIAL_ADDRESS_ALIAS +.Pq Event 07H , Umask 01H +Counts false dependency due to partial address aliasing +.It Li DTLB_LOAD_MISSES.ANY +.Pq Event 08H , Umask 01H +Counts all load misses that cause a page walk +.It Li DTLB_LOAD_MISSES.WALK_COMPLETED +.Pq Event 08H , Umask 02H +Counts number of completed page walks due to load miss in the STLB. +.It Li DTLB_LOAD_MISSES.WALK_CYCLES +.Pq Event 08H , Umask 04H +Cycles PMH is busy with a page walk due to a load miss in the STLB. +.It Li DTLB_LOAD_MISSES.STLB_HIT +.Pq Event 08H , Umask 10H +Number of cache load STLB hits +.It Li DTLB_LOAD_MISSES.PDE_MISS +.Pq Event 08H , Umask 20H +Number of DTLB cache load misses where the low part of the linear to +physical address translation was missed. +.It Li MEM_INST_RETIRED.LOADS +.Pq Event 0BH , Umask 01H +Counts the number of instructions with an architecturally-visible load +retired on the architected path.
+In conjunction with ld_lat facility +.It Li MEM_INST_RETIRED.STORES +.Pq Event 0BH , Umask 02H +Counts the number of instructions with an architecturally-visible store +retired on the architected path. +In conjunction with ld_lat facility +.It Li MEM_INST_RETIRED.LATENCY_ABOVE_THRESHOLD +.Pq Event 0BH , Umask 10H +Counts the number of instructions exceeding the latency specified with +ld_lat facility. +In conjunction with ld_lat facility +.It Li MEM_STORE_RETIRED.DTLB_MISS +.Pq Event 0CH , Umask 01H +The event counts the number of retired stores that missed the DTLB. The DTLB +miss is not counted if the store operation causes a fault. Does not count +prefetches. Counts both primary and secondary misses to the TLB +.It Li UOPS_ISSUED.ANY +.Pq Event 0EH , Umask 01H +Counts the number of Uops issued by the Register Allocation Table to the +Reservation Station, i.e. the UOPs issued from the front end to the back +end. +.It Li UOPS_ISSUED.STALLED_CYCLES +.Pq Event 0EH , Umask 01H +Counts the number of cycles no Uops issued by the Register Allocation Table +to the Reservation Station, i.e. the UOPs issued from the front end to the +back end. +set invert=1, cmask = 1 +.It Li UOPS_ISSUED.FUSED +.Pq Event 0EH , Umask 02H +Counts the number of fused Uops that were issued from the Register +Allocation Table to the Reservation Station.
+.It Li MEM_UNCORE_RETIRED.LOCAL_HITM +.Pq Event 0FH , Umask 02H +Load instructions retired that HIT modified data in sibling core (Precise +Event) +.It Li MEM_UNCORE_RETIRED.LOCAL_DRAM_AND_REMOTE_CACHE_HIT +.Pq Event 0FH , Umask 08H +Load instructions retired local dram and remote cache HIT data sources +(Precise Event) +.It Li MEM_UNCORE_RETIRED.LOCAL_DRAM +.Pq Event 0FH , Umask 10H +Load instructions retired with a data source of local DRAM or locally homed +remote cache HITM (Precise Event) +.It Li MEM_UNCORE_RETIRED.REMOTE_DRAM +.Pq Event 0FH , Umask 20H +Load instructions retired remote DRAM and remote home-remote cache HITM +(Precise Event) +.It Li MEM_UNCORE_RETIRED.UNCACHEABLE +.Pq Event 0FH , Umask 80H +Load instructions retired I/O (Precise Event) +.It Li FP_COMP_OPS_EXE.X87 +.Pq Event 10H , Umask 01H +Counts the number of FP Computational Uops Executed. The number of FADD, +FSUB, FCOM, FMULs, integer MULs and IMULs, FDIVs, FPREMs, FSQRTS, integer +DIVs, and IDIVs. This event does not distinguish an FADD used in the middle +of a transcendental flow from a separate FADD instruction. +.It Li FP_COMP_OPS_EXE.MMX +.Pq Event 10H , Umask 02H +Counts number of MMX Uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP +.Pq Event 10H , Umask 04H +Counts number of SSE and SSE2 FP uops executed. +.It Li FP_COMP_OPS_EXE.SSE2_INTEGER +.Pq Event 10H , Umask 08H +Counts number of SSE2 integer uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP_PACKED +.Pq Event 10H , Umask 10H +Counts number of SSE FP packed uops executed. +.It Li FP_COMP_OPS_EXE.SSE_FP_SCALAR +.Pq Event 10H , Umask 20H +Counts number of SSE FP scalar uops executed. +.It Li FP_COMP_OPS_EXE.SSE_SINGLE_PRECISION +.Pq Event 10H , Umask 40H +Counts number of SSE* FP single precision uops executed. +.It Li FP_COMP_OPS_EXE.SSE_DOUBLE_PRECISION +.Pq Event 10H , Umask 80H +Counts number of SSE* FP double precision uops executed.
+.It Li SIMD_INT_128.PACKED_MPY +.Pq Event 12H , Umask 01H +Counts number of 128 bit SIMD integer multiply operations. +.It Li SIMD_INT_128.PACKED_SHIFT +.Pq Event 12H , Umask 02H +Counts number of 128 bit SIMD integer shift operations. +.It Li SIMD_INT_128.PACK +.Pq Event 12H , Umask 04H +Counts number of 128 bit SIMD integer pack operations. +.It Li SIMD_INT_128.UNPACK +.Pq Event 12H , Umask 08H +Counts number of 128 bit SIMD integer unpack operations. +.It Li SIMD_INT_128.PACKED_LOGICAL +.Pq Event 12H , Umask 10H +Counts number of 128 bit SIMD integer logical operations. +.It Li SIMD_INT_128.PACKED_ARITH +.Pq Event 12H , Umask 20H +Counts number of 128 bit SIMD integer arithmetic operations. +.It Li SIMD_INT_128.SHUFFLE_MOVE +.Pq Event 12H , Umask 40H +Counts number of 128 bit SIMD integer shuffle and move operations. +.It Li LOAD_DISPATCH.RS +.Pq Event 13H , Umask 01H +Counts number of loads dispatched from the Reservation Station that bypass +the Memory Order Buffer. +.It Li LOAD_DISPATCH.RS_DELAYED +.Pq Event 13H , Umask 02H +Counts the number of delayed RS dispatches at the stage latch. If an RS +dispatch can not bypass to LB, it has another chance to dispatch from the +one-cycle delayed staging latch before it is written into the LB. +.It Li LOAD_DISPATCH.MOB +.Pq Event 13H , Umask 04H +Counts the number of loads dispatched from the Reservation Station to the +Memory Order Buffer. +.It Li LOAD_DISPATCH.ANY +.Pq Event 13H , Umask 07H +Counts all loads dispatched from the Reservation Station. +.It Li ARITH.CYCLES_DIV_BUSY +.Pq Event 14H , Umask 01H +Counts the number of cycles the divider is busy executing divide or square +root operations. The divide can be integer, X87 or Streaming SIMD Extensions +(SSE). The square root operation can be either X87 or SSE. +Set 'edge =1, invert=1, cmask=1' to count the number of divides. +Count may be incorrect When SMT is on +.It Li ARITH.MUL +.Pq Event 14H , Umask 02H +Counts the number of multiply operations executed. 
This includes integer as +well as floating point multiply operations but excludes DPPS mul and MPSAD. +Count may be incorrect When SMT is on +.It Li INST_QUEUE_WRITES +.Pq Event 17H , Umask 01H +Counts the number of instructions written into the instruction queue every +cycle. +.It Li INST_DECODED.DEC0 +.Pq Event 18H , Umask 01H +Counts number of instructions that require decoder 0 to be decoded. Usually, +this means that the instruction maps to more than 1 uop +.It Li TWO_UOP_INSTS_DECODED +.Pq Event 19H , Umask 01H +An instruction that generates two uops was decoded +.It Li INST_QUEUE_WRITE_CYCLES +.Pq Event 1EH , Umask 01H +This event counts the number of cycles during which instructions are written +to the instruction queue. Dividing this counter by the number of +instructions written to the instruction queue (INST_QUEUE_WRITES) yields the +average number of instructions decoded each cycle. If this number is less +than four and the pipe stalls, this indicates that the decoder is failing to +decode enough instructions per cycle to sustain the 4-wide pipeline. +If SSE* instructions that are 6 bytes or longer arrive one after another, +then front end throughput may limit execution speed. In such case, +.It Li LSD_OVERFLOW +.Pq Event 20H , Umask 01H +Number of loops that can not stream from the instruction queue. +.It Li L2_RQSTS.LD_HIT +.Pq Event 24H , Umask 01H +Counts number of loads that hit the L2 cache. L2 loads include both L1D +demand misses as well as L1D prefetches. L2 loads can be rejected for +various reasons. Only non rejected loads are counted. +.It Li L2_RQSTS.LD_MISS +.Pq Event 24H , Umask 02H +Counts the number of loads that miss the L2 cache. L2 loads include both L1D +demand misses as well as L1D prefetches. +.It Li L2_RQSTS.LOADS +.Pq Event 24H , Umask 03H +Counts all L2 load requests. L2 loads include both L1D demand misses as well +as L1D prefetches. 
+.It Li L2_RQSTS.RFO_HIT +.Pq Event 24H , Umask 04H +Counts the number of store RFO requests that hit the L2 cache. L2 RFO +requests include both L1D demand RFO misses as well as L1D RFO prefetches. +Count includes WC memory requests, where the data is not fetched but the +permission to write the line is required. +.It Li L2_RQSTS.RFO_MISS +.Pq Event 24H , Umask 08H +Counts the number of store RFO requests that miss the L2 cache. L2 RFO +requests include both L1D demand RFO misses as well as L1D RFO prefetches. +.It Li L2_RQSTS.RFOS +.Pq Event 24H , Umask 0CH +Counts all L2 store RFO requests. L2 RFO requests include both L1D demand +RFO misses as well as L1D RFO prefetches.. +.It Li L2_RQSTS.IFETCH_HIT +.Pq Event 24H , Umask 10H +Counts number of instruction fetches that hit the L2 cache. L2 instruction +fetches include both L1I demand misses as well as L1I instruction +prefetches. +.It Li L2_RQSTS.IFETCH_MISS +.Pq Event 24H , Umask 20H +Counts number of instruction fetches that miss the L2 cache. L2 instruction +fetches include both L1I demand misses as well as L1I instruction +prefetches. +.It Li L2_RQSTS.IFETCHES +.Pq Event 24H , Umask 30H +Counts all instruction fetches. L2 instruction fetches include both L1I +demand misses as well as L1I instruction prefetches. +.It Li L2_RQSTS.PREFETCH_HIT +.Pq Event 24H , Umask 40H +Counts L2 prefetch hits for both code and data. +.It Li L2_RQSTS.PREFETCH_MISS +.Pq Event 24H , Umask 80H +Counts L2 prefetch misses for both code and data. +.It Li L2_RQSTS.PREFETCHES +.Pq Event 24H , Umask C0H +Counts all L2 prefetches for both code and data. +.It Li L2_RQSTS.MISS +.Pq Event 24H , Umask AAH +Counts all L2 misses for both code and data. +.It Li L2_RQSTS.REFERENCES +.Pq Event 24H , Umask FFH +Counts all L2 requests for both code and data. +.It Li L2_DATA_RQSTS.DEMAND.I_STATE +.Pq Event 26H , Umask 01H +Counts number of L2 data demand loads where the cache line to be loaded is +in the I (invalid) state, i.e. a cache miss. 
L2 demand loads are both L1D +demand misses and L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.S_STATE +.Pq Event 26H , Umask 02H +Counts number of L2 data demand loads where the cache line to be loaded is +in the S (shared) state. L2 demand loads are both L1D demand misses and L1D +prefetches. +.It Li L2_DATA_RQSTS.DEMAND.E_STATE +.Pq Event 26H , Umask 04H +Counts number of L2 data demand loads where the cache line to be loaded is +in the E (exclusive) state. L2 demand loads are both L1D demand misses and +L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.M_STATE +.Pq Event 26H , Umask 08H +Counts number of L2 data demand loads where the cache line to be loaded is +in the M (modified) state. L2 demand loads are both L1D demand misses and +L1D prefetches. +.It Li L2_DATA_RQSTS.DEMAND.MESI +.Pq Event 26H , Umask 0FH +Counts all L2 data demand requests. L2 demand loads are both L1D demand +misses and L1D prefetches. +.It Li L2_DATA_RQSTS.PREFETCH.I_STATE +.Pq Event 26H , Umask 10H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the I (invalid) state, i.e. a cache miss. +.It Li L2_DATA_RQSTS.PREFETCH.S_STATE +.Pq Event 26H , Umask 20H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the S (shared) state. A prefetch RFO will miss on an S state line, while +a prefetch read will hit on an S state line. +.It Li L2_DATA_RQSTS.PREFETCH.E_STATE +.Pq Event 26H , Umask 40H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the E (exclusive) state. +.It Li L2_DATA_RQSTS.PREFETCH.M_STATE +.Pq Event 26H , Umask 80H +Counts number of L2 prefetch data loads where the cache line to be loaded is +in the M (modified) state. +.It Li L2_DATA_RQSTS.PREFETCH.MESI +.Pq Event 26H , Umask F0H +Counts all L2 prefetch requests. +.It Li L2_DATA_RQSTS.ANY +.Pq Event 26H , Umask FFH +Counts all L2 data requests. 
+.It Li L2_WRITE.RFO.I_STATE +.Pq Event 27H , Umask 01H +Counts number of L2 demand store RFO requests where the cache line to be +loaded is in the I (invalid) state, i.e. a cache miss. The L1D prefetcher +does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.S_STATE +.Pq Event 27H , Umask 02H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the S (shared) state. The L1D prefetcher does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.M_STATE +.Pq Event 27H , Umask 08H +Counts number of L2 store RFO requests where the cache line to be loaded is +in the M (modified) state. The L1D prefetcher does not issue a RFO prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.HIT +.Pq Event 27H , Umask 0EH +Counts number of L2 store RFO requests where the cache line to be loaded is +in either the S, E or M states. The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request +.It Li L2_WRITE.RFO.MESI +.Pq Event 27H , Umask 0FH +Counts all L2 store RFO requests. The L1D prefetcher does not issue a RFO +prefetch. +This is a demand RFO request +.It Li L2_WRITE.LOCK.I_STATE +.Pq Event 27H , Umask 10H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the I (invalid) state, i.e. a cache miss. +.It Li L2_WRITE.LOCK.S_STATE +.Pq Event 27H , Umask 20H +Counts number of L2 lock RFO requests where the cache line to be loaded is +in the S (shared) state. +.It Li L2_WRITE.LOCK.E_STATE +.Pq Event 27H , Umask 40H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the E (exclusive) state. +.It Li L2_WRITE.LOCK.M_STATE +.Pq Event 27H , Umask 80H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in the M (modified) state.
+.It Li L2_WRITE.LOCK.HIT +.Pq Event 27H , Umask E0H +Counts number of L2 demand lock RFO requests where the cache line to be +loaded is in either the S, E, or M state. +.It Li L2_WRITE.LOCK.MESI +.Pq Event 27H , Umask F0H +Counts all L2 demand lock RFO requests. +.It Li L1D_WB_L2.I_STATE +.Pq Event 28H , Umask 01H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the I (invalid) state, i.e. a cache miss. +.It Li L1D_WB_L2.S_STATE +.Pq Event 28H , Umask 02H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the S state. +.It Li L1D_WB_L2.E_STATE +.Pq Event 28H , Umask 04H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the E (exclusive) state. +.It Li L1D_WB_L2.M_STATE +.Pq Event 28H , Umask 08H +Counts number of L1 writebacks to the L2 where the cache line to be written +is in the M (modified) state. +.It Li L1D_WB_L2.MESI +.Pq Event 28H , Umask 0FH +Counts all L1 writebacks to the L2. +.It Li L3_LAT_CACHE.REFERENCE +.Pq Event 2EH , Umask 02H +Counts uncore Last Level Cache references. Because cache hierarchy, cache +sizes and other implementation-specific characteristics; value comparison to +estimate performance differences is not recommended. +see Table A-1 +.It Li L3_LAT_CACHE.MISS +.Pq Event 2EH , Umask 01H +Counts uncore Last Level Cache misses. Because cache hierarchy, cache sizes +and other implementation-specific characteristics; value comparison to +estimate performance differences is not recommended. +see Table A-1 +.It Li CPU_CLK_UNHALTED.THREAD_P +.Pq Event 3CH , Umask 00H +Counts the number of thread cycles while the thread is not in a halt state. +The thread enters the halt state when it is running the HLT instruction. The +core frequency may change from time to time due to power or thermal +throttling. +see Table A-1 +.It Li CPU_CLK_UNHALTED.REF_P +.Pq Event 3CH , Umask 01H +Increments at the frequency of TSC when not halted. 
+see Table A-1 +.It Li DTLB_MISSES.ANY +.Pq Event 49H , Umask 01H +Counts the number of misses in the STLB which causes a page walk. +.It Li DTLB_MISSES.WALK_COMPLETED +.Pq Event 49H , Umask 02H +Counts number of misses in the STLB which resulted in a completed page walk. +.It Li DTLB_MISSES.WALK_CYCLES +.Pq Event 49H , Umask 04H +Counts cycles of page walk due to misses in the STLB. +.It Li DTLB_MISSES.STLB_HIT +.Pq Event 49H , Umask 10H +Counts the number of DTLB first level misses that hit in the second level +TLB. This event is only relevant if the core contains multiple DTLB levels. +.It Li DTLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 49H , Umask 80H +Counts number of completed large page walks due to misses in the STLB. +.It Li LOAD_HIT_PRE +.Pq Event 4CH , Umask 01H +Counts load operations sent to the L1 data cache while a previous SSE +prefetch instruction to the same cache line has started prefetching but has +not yet finished. +.It Li L1D_PREFETCH.REQUESTS +.Pq Event 4EH , Umask 01H +Counts number of hardware prefetch requests dispatched out of the prefetch +FIFO. +.It Li L1D_PREFETCH.MISS +.Pq Event 4EH , Umask 02H +Counts number of hardware prefetch requests that miss the L1D. There are two +prefetchers in the L1D. A streamer, which predicts lines sequentially after +this one should be fetched, and the IP prefetcher that remembers access +patterns for the current instruction. The streamer prefetcher stops on an +L1D hit, while the IP prefetcher does not. +.It Li L1D_PREFETCH.TRIGGERS +.Pq Event 4EH , Umask 04H +Counts number of prefetch requests triggered by the Finite State Machine and +pushed into the prefetch FIFO. Some of the prefetch requests are dropped due +to overwrites or competition between the IP index prefetcher and streamer +prefetcher. The prefetch FIFO contains 4 entries. +.It Li EPT.WALK_CYCLES +.Pq Event 4FH , Umask 10H +Counts Extended Page walk cycles. 
+.It Li L1D.REPL +.Pq Event 51H , Umask 01H +Counts the number of lines brought into the L1 data cache. +Counter 0, 1 only +.It Li L1D.M_REPL +.Pq Event 51H , Umask 02H +Counts the number of modified lines brought into the L1 data cache. +Counter 0, 1 only +.It Li L1D.M_EVICT +.Pq Event 51H , Umask 04H +Counts the number of modified lines evicted from the L1 data cache due to +replacement. +Counter 0, 1 only +.It Li L1D.M_SNOOP_EVICT +.Pq Event 51H , Umask 08H +Counts the number of modified lines evicted from the L1 data cache due to +snoop HITM intervention. +Counter 0, 1 only +.It Li L1D_CACHE_PREFETCH_LOCK_FB_HIT +.Pq Event 52H , Umask 01H +Counts the number of cacheable load lock speculated instructions accepted +into the fill buffer. +.It Li L1D_CACHE_LOCK_FB_HIT +.Pq Event 53H , Umask 01H +Counts the number of cacheable load lock speculated or retired instructions +accepted into the fill buffer. +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_DATA +.Pq Event 60H , Umask 01H +Counts weighted cycles of offcore demand data read requests. Does not +include L2 prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.READ_CODE +.Pq Event 60H , Umask 02H +Counts weighted cycles of offcore demand code read requests. Does not +include L2 prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.DEMAND.RFO +.Pq Event 60H , Umask 04H +Counts weighted cycles of offcore demand RFO requests. Does not include L2 +prefetch requests. +counter 0 +.It Li OFFCORE_REQUESTS_OUTSTANDING.ANY.READ +.Pq Event 60H , Umask 08H +Counts weighted cycles of offcore read requests of any kind. Include L2 +prefetch requests. +counter 0 +.It Li CACHE_LOCK_CYCLES.L1D_L2 +.Pq Event 63H , Umask 01H +Cycle count during which the L1D and L2 are locked. A lock is asserted when +there is a locked memory access, due to uncacheable memory, a locked +operation that spans two cache lines, or a page walk from an uncacheable +page table. +Counter 0, 1 only. 
L1D and L2 locks have a very high performance penalty and +it is highly recommended to avoid such accesses. +.It Li CACHE_LOCK_CYCLES.L1D +.Pq Event 63H , Umask 02H +Counts the number of cycles that cacheline in the L1 data cache unit is +locked. +Counter 0, 1 only. +.It Li IO_TRANSACTIONS +.Pq Event 6CH , Umask 01H +Counts the number of completed I/O transactions. +.It Li L1I.HITS +.Pq Event 80H , Umask 01H +Counts all instruction fetches that hit the L1 instruction cache. +.It Li L1I.MISSES +.Pq Event 80H , Umask 02H +Counts all instruction fetches that miss the L1I cache. This includes +instruction cache misses, streaming buffer misses, victim cache misses and +uncacheable fetches. An instruction fetch miss is counted only once and not +once for every cycle it is outstanding. +.It Li L1I.READS +.Pq Event 80H , Umask 03H +Counts all instruction fetches, including uncacheable fetches that bypass +the L1I. +.It Li L1I.CYCLES_STALLED +.Pq Event 80H , Umask 04H +Cycle counts for which an instruction fetch stalls due to a L1I cache miss, +ITLB miss or ITLB fault. +.It Li LARGE_ITLB.HIT +.Pq Event 82H , Umask 01H +Counts number of large ITLB hits. +.It Li ITLB_MISSES.ANY +.Pq Event 85H , Umask 01H +Counts the number of misses in all levels of the ITLB which causes a page +walk. +.It Li ITLB_MISSES.WALK_COMPLETED +.Pq Event 85H , Umask 02H +Counts number of misses in all levels of the ITLB which resulted in a +completed page walk. +.It Li ITLB_MISSES.WALK_CYCLES +.Pq Event 85H , Umask 04H +Counts ITLB miss page walk cycles. +.It Li ITLB_MISSES.LARGE_WALK_COMPLETED +.Pq Event 85H , Umask 80H +Counts number of completed large page walks due to misses in the STLB. +.It Li ILD_STALL.LCP +.Pq Event 87H , Umask 01H +Cycles Instruction Length Decoder stalls due to length changing prefixes: +66, 67 or REX.W (for EM64T) instructions which change the length of the +decoded instruction. 
+.It Li ILD_STALL.MRU +.Pq Event 87H , Umask 02H +Instruction Length Decoder stall cycles due to Branch Prediction Unit (BPU) +Most Recently Used (MRU) bypass. +.It Li ILD_STALL.IQ_FULL +.Pq Event 87H , Umask 04H +Stall cycles due to a full instruction queue. +.It Li ILD_STALL.REGEN +.Pq Event 87H , Umask 08H +Counts the number of regen stalls. +.It Li ILD_STALL.ANY +.Pq Event 87H , Umask 0FH +Counts any cycles the Instruction Length Decoder is stalled. +.It Li BR_INST_EXEC.COND +.Pq Event 88H , Umask 01H +Counts the number of conditional near branch instructions executed, but not +necessarily retired. +.It Li BR_INST_EXEC.DIRECT +.Pq Event 88H , Umask 02H +Counts all unconditional near branch instructions excluding calls and +indirect branches. +.It Li BR_INST_EXEC.INDIRECT_NON_CALL +.Pq Event 88H , Umask 04H +Counts the number of executed indirect near branch instructions that are not +calls. +.It Li BR_INST_EXEC.NON_CALLS +.Pq Event 88H , Umask 07H +Counts all non call near branch instructions executed, but not necessarily +retired. +.It Li BR_INST_EXEC.RETURN_NEAR +.Pq Event 88H , Umask 08H +Counts indirect near branches that have a return mnemonic. +.It Li BR_INST_EXEC.DIRECT_NEAR_CALL +.Pq Event 88H , Umask 10H +Counts unconditional near call branch instructions, excluding non call +branch, executed. +.It Li BR_INST_EXEC.INDIRECT_NEAR_CALL +.Pq Event 88H , Umask 20H +Counts indirect near calls, including both register and memory indirect, +executed. +.It Li BR_INST_EXEC.NEAR_CALLS +.Pq Event 88H , Umask 30H +Counts all near call branches executed, but not necessarily retired. +.It Li BR_INST_EXEC.TAKEN +.Pq Event 88H , Umask 40H +Counts taken near branches executed, but not necessarily retired. +.It Li BR_INST_EXEC.ANY +.Pq Event 88H , Umask 7FH +Counts all near executed branches (not necessarily retired). This includes +only instructions and not micro-op branches. Frequent branching is not +necessarily a major performance issue.
However, frequent branch +mispredictions may be a problem. +.It Li BR_MISP_EXEC.COND +.Pq Event 89H , Umask 01H +Counts the number of mispredicted conditional near branch instructions +executed, but not necessarily retired. +.It Li BR_MISP_EXEC.DIRECT +.Pq Event 89H , Umask 02H +Counts mispredicted macro unconditional near branch instructions, excluding +calls and indirect branches (should always be 0). +.It Li BR_MISP_EXEC.INDIRECT_NON_CALL +.Pq Event 89H , Umask 04H +Counts the number of executed mispredicted indirect near branch instructions +that are not calls. +.It Li BR_MISP_EXEC.NON_CALLS +.Pq Event 89H , Umask 07H +Counts mispredicted non call near branches executed, but not necessarily +retired. +.It Li BR_MISP_EXEC.RETURN_NEAR +.Pq Event 89H , Umask 08H +Counts mispredicted indirect branches that have a near return mnemonic. +.It Li BR_MISP_EXEC.DIRECT_NEAR_CALL +.Pq Event 89H , Umask 10H +Counts mispredicted non-indirect near calls executed, (should always be 0). +.It Li BR_MISP_EXEC.INDIRECT_NEAR_CALL +.Pq Event 89H , Umask 20H +Counts mispredicted indirect near calls executed, including both register +and memory indirect. +.It Li BR_MISP_EXEC.NEAR_CALLS +.Pq Event 89H , Umask 30H +Counts all mispredicted near call branches executed, but not necessarily +retired. +.It Li BR_MISP_EXEC.TAKEN +.Pq Event 89H , Umask 40H +Counts executed mispredicted near branches that are taken, but not +necessarily retired. +.It Li BR_MISP_EXEC.ANY +.Pq Event 89H , Umask 7FH +Counts the number of mispredicted near branch instructions that were +executed, but not necessarily retired. +.It Li RESOURCE_STALLS.ANY +.Pq Event A2H , Umask 01H +Counts the number of Allocator resource related stalls. Includes register +renaming buffer entries, memory buffer entries. In addition to resource +related stalls, this event counts some other events.
Includes stalls arising +during branch misprediction recovery, such as if retirement of the +mispredicted branch is delayed and stalls arising while store buffer is +draining from synchronizing operations. +Does not include stalls due to SuperQ (off core) queue full, too many cache +misses, etc. +.It Li RESOURCE_STALLS.LOAD +.Pq Event A2H , Umask 02H +Counts the cycles of stall due to lack of load buffer for load operation. +.It Li RESOURCE_STALLS.RS_FULL +.Pq Event A2H , Umask 04H +This event counts the number of cycles when the number of instructions in +the pipeline waiting for execution reaches the limit the processor can +handle. A high count of this event indicates that there are long latency +operations in the pipe (possibly load and store operations that miss the L2 +cache, or instructions dependent upon instructions further down the pipeline +that have yet to retire). +When RS is full, new instructions can not enter the reservation station and +start execution. +.It Li RESOURCE_STALLS.STORE +.Pq Event A2H , Umask 08H +This event counts the number of cycles that a resource related stall will +occur due to the number of store instructions reaching the limit of the +pipeline, (i.e. all store buffers are used). The stall ends when a store +instruction commits its data to the cache or memory. +.It Li RESOURCE_STALLS.ROB_FULL +.Pq Event A2H , Umask 10H +Counts the cycles of stall due to re-order buffer full. +.It Li RESOURCE_STALLS.FPCW +.Pq Event A2H , Umask 20H +Counts the number of cycles while execution was stalled due to writing the +floating-point unit (FPU) control word. +.It Li RESOURCE_STALLS.MXCSR +.Pq Event A2H , Umask 40H +Stalls due to the MXCSR register rename occurring too close to a previous +MXCSR rename. The MXCSR provides control and status for the MMX registers. +.It Li RESOURCE_STALLS.OTHER +.Pq Event A2H , Umask 80H +Counts the number of cycles while execution was stalled due to other +resource issues.
+.It Li MACRO_INSTS.FUSIONS_DECODED +.Pq Event A6H , Umask 01H +Counts the number of instructions decoded that are macro-fused but not +necessarily executed or retired. +.It Li BACLEAR_FORCE_IQ +.Pq Event A7H , Umask 01H +Counts number of times a BACLEAR was forced by the Instruction Queue. The IQ +is also responsible for providing conditional branch prediction direction +based on a static scheme and dynamic data provided by the L2 Branch +Prediction Unit. If the conditional branch target is not found in the Target +Array and the IQ predicts that the branch is taken, then the IQ will force +the Branch Address Calculator to issue a BACLEAR. Each BACLEAR asserted by +the BAC generates approximately an 8 cycle bubble in the instruction fetch +pipeline. +.It Li LSD.UOPS +.Pq Event A8H , Umask 01H +Counts the number of micro-ops delivered by loop stream detector +Use cmask=1 and invert to count cycles +.It Li ITLB_FLUSH +.Pq Event AEH , Umask 01H +Counts the number of ITLB flushes +.It Li OFFCORE_REQUESTS.DEMAND.READ_DATA +.Pq Event B0H , Umask 01H +Counts number of offcore demand data read requests. Does not count L2 +prefetch requests. +.It Li OFFCORE_REQUESTS.DEMAND.READ_CODE +.Pq Event B0H , Umask 02H +Counts number of offcore demand code read requests. Does not count L2 +prefetch requests. +.It Li OFFCORE_REQUESTS.DEMAND.RFO +.Pq Event B0H , Umask 04H +Counts number of offcore demand RFO requests. Does not count L2 prefetch +requests. +.It Li OFFCORE_REQUESTS.ANY.READ +.Pq Event B0H , Umask 08H +Counts number of offcore read requests. Includes L2 prefetch requests. +.It Li OFFCORE_REQUESTS.ANY.RFO +.Pq Event B0H , Umask 10H +Counts number of offcore RFO requests. Includes L2 prefetch requests. +.It Li OFFCORE_REQUESTS.L1D_WRITEBACK +.Pq Event B0H , Umask 40H +Counts number of L1D writebacks to the uncore. +.It Li OFFCORE_REQUESTS.ANY +.Pq Event B0H , Umask 80H +Counts all offcore requests.
+.It Li UOPS_EXECUTED.PORT0 +.Pq Event B1H , Umask 01H +Counts number of Uops executed that were issued on port 0. Port 0 handles +integer arithmetic, SIMD and FP add Uops. +.It Li UOPS_EXECUTED.PORT1 +.Pq Event B1H , Umask 02H +Counts number of Uops executed that were issued on port 1. Port 1 handles +integer arithmetic, SIMD, integer shift, FP multiply and FP divide Uops. +.It Li UOPS_EXECUTED.PORT2_CORE +.Pq Event B1H , Umask 04H +Counts number of Uops executed that were issued on port 2. Port 2 handles +the load Uops. This is a core count only and can not be collected per +thread. +.It Li UOPS_EXECUTED.PORT3_CORE +.Pq Event B1H , Umask 08H +Counts number of Uops executed that were issued on port 3. Port 3 handles +store Uops. This is a core count only and can not be collected per thread. +.It Li UOPS_EXECUTED.PORT4_CORE +.Pq Event B1H , Umask 10H +Counts number of Uops executed that were issued on port 4. Port 4 handles +the value to be stored for the store Uops issued on port 3. This is a core +count only and can not be collected per thread. +.It Li UOPS_EXECUTED.CORE_ACTIVE_CYCLES_NO_PORT5 +.Pq Event B1H , Umask 1FH +Counts number of cycles there are one or more uops being executed and were +issued on ports 0-4. This is a core count only and can not be collected per +thread. +.It Li UOPS_EXECUTED.PORT5 +.Pq Event B1H , Umask 20H +Counts number of Uops executed that were issued on port 5. +.It Li UOPS_EXECUTED.CORE_ACTIVE_CYCLES +.Pq Event B1H , Umask 3FH +Counts number of cycles there are one or more uops being executed on any +ports. This is a core count only and can not be collected per thread. +.It Li UOPS_EXECUTED.PORT015 +.Pq Event B1H , Umask 40H +Counts number of Uops executed that were issued on port 0, 1, or 5. +Use cmask=1, invert=1 to count stall cycles +.It Li UOPS_EXECUTED.PORT234 +.Pq Event B1H , Umask 80H +Counts number of Uops executed that were issued on port 2, 3, or 4.
+.It Li OFFCORE_REQUESTS_SQ_FULL +.Pq Event B2H , Umask 01H +Counts number of cycles the SQ is full to handle off-core requests. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.DATA +.Pq Event B3H , Umask 01H +Counts weighted cycles of snoopq requests for data. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.INVALIDATE +.Pq Event B3H , Umask 02H +Counts weighted cycles of snoopq invalidate requests. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS_OUTSTANDING.CODE +.Pq Event B3H , Umask 04H +Counts weighted cycles of snoopq requests for code. Counter 0 only +Use cmask=1 to count cycles not empty. +.It Li SNOOPQ_REQUESTS.CODE +.Pq Event B4H , Umask 01H +Counts the number of snoop code requests +.It Li SNOOPQ_REQUESTS.DATA +.Pq Event B4H , Umask 02H +Counts the number of snoop data requests +.It Li SNOOPQ_REQUESTS.INVALIDATE +.Pq Event B4H , Umask 04H +Counts the number of snoop invalidate requests +.It Li OFF_CORE_RESPONSE_0 +.Pq Event B7H , Umask 01H +see Section 30.6.1.3, Off-core Response Performance Monitoring in the +Processor Core. +Requires programming MSR 01A6H +.It Li SNOOP_RESPONSE.HIT +.Pq Event B8H , Umask 01H +Counts HIT snoop response sent by this thread in response to a snoop +request. +.It Li SNOOP_RESPONSE.HITE +.Pq Event B8H , Umask 02H +Counts HIT E snoop response sent by this thread in response to a snoop +request. +.It Li SNOOP_RESPONSE.HITM +.Pq Event B8H , Umask 04H +Counts HIT M snoop response sent by this thread in response to a snoop +request. +.It Li OFF_CORE_RESPONSE_1 +.Pq Event BBH , Umask 01H +see Section 30.6.1.3, Off-core Response Performance Monitoring in the +Processor Core +Use MSR 01A7H +.It Li INST_RETIRED.ANY_P +.Pq Event C0H , Umask 01H +See Table A-1 +Notes: INST_RETIRED.ANY is counted by a designated fixed counter. +INST_RETIRED.ANY_P is counted by a programmable counter and is an +architectural performance event. Event is supported if CPUID.A.EBX[1] = 0. 
+Counting: Faulting executions of GETSEC/VM entry/VM Exit/MWait will not +count as retired instructions. +.It Li INST_RETIRED.X87 +.Pq Event C0H , Umask 02H +Counts the number of floating point computational operations retired: +floating point computational operations executed by the assist handler and +sub-operations of complex floating point instructions like transcendental +instructions. +.It Li INST_RETIRED.MMX +.Pq Event C0H , Umask 04H +Counts the number of retired: MMX instructions. +.It Li UOPS_RETIRED.ANY +.Pq Event C2H , Umask 01H +Counts the number of micro-ops retired, (macro-fused=1, micro-fused=2, +others=1; maximum count of 8 per cycle). Most instructions are composed of +one or two micro-ops. Some instructions are decoded into longer sequences +such as repeat instructions, floating point transcendental instructions, and +assists. +Use cmask=1 and invert to count active cycles or stalled cycles +.It Li UOPS_RETIRED.RETIRE_SLOTS +.Pq Event C2H , Umask 02H +Counts the number of retirement slots used each cycle +.It Li UOPS_RETIRED.MACRO_FUSED +.Pq Event C2H , Umask 04H +Counts number of macro-fused uops retired. +.It Li MACHINE_CLEARS.CYCLES +.Pq Event C3H , Umask 01H +Counts the cycles machine clear is asserted. +.It Li MACHINE_CLEARS.MEM_ORDER +.Pq Event C3H , Umask 02H +Counts the number of machine clears due to memory order conflicts. +.It Li MACHINE_CLEARS.SMC +.Pq Event C3H , Umask 04H +Counts the number of times that a program writes to a code section. +Self-modifying code causes a severe penalty in all Intel 64 and IA-32 +processors. The modified cache line is written back to the L2 and L3 caches. +.It Li BR_INST_RETIRED.ALL_BRANCHES +.Pq Event C4H , Umask 00H +See Table A-1 +.It Li BR_INST_RETIRED.CONDITIONAL +.Pq Event C4H , Umask 01H +Counts the number of conditional branch instructions retired.
+.It Li BR_INST_RETIRED.NEAR_CALL +.Pq Event C4H , Umask 02H +Counts the number of direct & indirect near unconditional calls retired. +.It Li BR_INST_RETIRED.ALL_BRANCHES +.Pq Event C4H , Umask 04H +Counts the number of branch instructions retired. +.It Li BR_MISP_RETIRED.ALL_BRANCHES +.Pq Event C5H , Umask 00H +See Table A-1 +.It Li BR_MISP_RETIRED.CONDITIONAL +.Pq Event C5H , Umask 01H +Counts mispredicted conditional retired calls. +.It Li BR_MISP_RETIRED.NEAR_CALL +.Pq Event C5H , Umask 02H +Counts mispredicted direct & indirect near unconditional retired calls. +.It Li BR_MISP_RETIRED.ALL_BRANCHES +.Pq Event C5H , Umask 04H +Counts all mispredicted retired calls. +.It Li SSEX_UOPS_RETIRED.PACKED_SINGLE +.Pq Event C7H , Umask 01H +Counts SIMD packed single-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.SCALAR_SINGLE +.Pq Event C7H , Umask 02H +Counts SIMD scalar single-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.PACKED_DOUBLE +.Pq Event C7H , Umask 04H +Counts SIMD packed double-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.SCALAR_DOUBLE +.Pq Event C7H , Umask 08H +Counts SIMD scalar double-precision floating point Uops retired. +.It Li SSEX_UOPS_RETIRED.VECTOR_INTEGER +.Pq Event C7H , Umask 10H +Counts 128-bit SIMD vector integer Uops retired. +.It Li ITLB_MISS_RETIRED +.Pq Event C8H , Umask 20H +Counts the number of retired instructions that missed the ITLB when the +instruction was fetched. +.It Li MEM_LOAD_RETIRED.L1D_HIT +.Pq Event CBH , Umask 01H +Counts number of retired loads that hit the L1 data cache. +.It Li MEM_LOAD_RETIRED.L2_HIT +.Pq Event CBH , Umask 02H +Counts number of retired loads that hit the L2 data cache. +.It Li MEM_LOAD_RETIRED.L3_UNSHARED_HIT +.Pq Event CBH , Umask 04H +Counts number of retired loads that hit their own, unshared lines in the L3 +cache. 
+.It Li MEM_LOAD_RETIRED.OTHER_CORE_L2_HIT_HITM +.Pq Event CBH , Umask 08H +Counts number of retired loads that hit in a sibling core's L2 (on die +core). Since the L3 is inclusive of all cores on the package, this is an L3 +hit. This counts both clean or modified hits. +.It Li MEM_LOAD_RETIRED.L3_MISS +.Pq Event CBH , Umask 10H +Counts number of retired loads that miss the L3 cache. The load was +satisfied by a remote socket, local memory or an IOH. +.It Li MEM_LOAD_RETIRED.HIT_LFB +.Pq Event CBH , Umask 40H +Counts number of retired loads that miss the L1D and the address is located +in an allocated line fill buffer and will soon be committed to cache. This +is counting secondary L1D misses. +.It Li MEM_LOAD_RETIRED.DTLB_MISS +.Pq Event CBH , Umask 80H +Counts the number of retired loads that missed the DTLB. The DTLB miss is +not counted if the load operation causes a fault. This event counts loads +from cacheable memory only. The event does not count loads by software +prefetches. Counts both primary and secondary misses to the TLB. +.It Li FP_MMX_TRANS.TO_FP +.Pq Event CCH , Umask 01H +Counts the first floating-point instruction following any MMX instruction. +You can use this event to estimate the penalties for the transitions between +floating-point and MMX technology states. +.It Li FP_MMX_TRANS.TO_MMX +.Pq Event CCH , Umask 02H +Counts the first MMX instruction following a floating-point instruction. You +can use this event to estimate the penalties for the transitions between +floating-point and MMX technology states. +.It Li FP_MMX_TRANS.ANY +.Pq Event CCH , Umask 03H +Counts all transitions from floating point to MMX instructions and from MMX +instructions to floating point instructions. You can use this event to +estimate the penalties for the transitions between floating-point and MMX +technology states. +.It Li MACRO_INSTS.DECODED +.Pq Event D0H , Umask 01H +Counts the number of instructions decoded, (but not necessarily executed or +retired). 
+.It Li UOPS_DECODED.STALL_CYCLES +.Pq Event D1H , Umask 01H +Counts the cycles of decoder stalls. +.It Li UOPS_DECODED.MS +.Pq Event D1H , Umask 02H +Counts the number of Uops decoded by the Microcode Sequencer, MS. The MS +delivers uops when the instruction is more than 4 uops long or a microcode +assist is occurring. +.It Li UOPS_DECODED.ESP_FOLDING +.Pq Event D1H , Umask 04H +Counts number of stack pointer (ESP) instructions decoded: push , pop , call +, ret, etc. ESP instructions do not generate a Uop to increment or decrement +ESP. Instead, they update an ESP_Offset register that keeps track of the +delta to the current value of the ESP register. +.It Li UOPS_DECODED.ESP_SYNC +.Pq Event D1H , Umask 08H +Counts number of stack pointer (ESP) sync operations where an ESP +instruction is corrected by adding the ESP offset register to the current +value of the ESP register. +.It Li RAT_STALLS.FLAGS +.Pq Event D2H , Umask 01H +Counts the number of cycles during which execution stalled due to several +reasons, one of which is a partial flag register stall. A partial register +stall may occur when two conditions are met: 1) an instruction modifies +some, but not all, of the flags in the flag register and 2) the next +instruction, which depends on flags, depends on flags that were not modified +by this instruction. +.It Li RAT_STALLS.REGISTERS +.Pq Event D2H , Umask 02H +This event counts the number of cycles instruction execution latency became +longer than the defined latency because the instruction used a register that +was partially written by previous instruction. +.It Li RAT_STALLS.ROB_READ_PORT +.Pq Event D2H , Umask 04H +Counts the number of cycles when ROB read port stalls occurred, which did +not allow new micro-ops to enter the out-of-order pipeline. Note that, at +this stage in the pipeline, additional stalls may occur at the same cycle +and prevent the stalled micro-ops from entering the pipe. 
In such a case, +micro-ops retry entering the execution pipe in the next cycle and the +ROB-read port stall is counted again. +.It Li RAT_STALLS.SCOREBOARD +.Pq Event D2H , Umask 08H +Counts the cycles where we stall due to microarchitecturally required +serialization. Microcode scoreboarding stalls. +.It Li RAT_STALLS.ANY +.Pq Event D2H , Umask 0FH +Counts all Register Allocation Table stall cycles due to: Cycles when ROB +read port stalls occurred, which did not allow new micro-ops to enter the +execution pipe. Cycles when partial register stalls occurred. Cycles when +flag stalls occurred. Cycles when floating-point unit (FPU) status word stalls +occurred. To count each of these conditions separately use the events: +RAT_STALLS.ROB_READ_PORT, RAT_STALLS.PARTIAL, RAT_STALLS.FLAGS, and +RAT_STALLS.FPSW. +.It Li SEG_RENAME_STALLS +.Pq Event D4H , Umask 01H +Counts the number of stall cycles due to the lack of renaming resources for +the ES, DS, FS, and GS segment registers. If a segment is renamed but not +retired and a second update to the same segment occurs, a stall occurs in +the front-end of the pipeline until the renamed segment retires. +.It Li ES_REG_RENAMES +.Pq Event D5H , Umask 01H +Counts the number of times the ES segment register is renamed. +.It Li UOP_UNFUSION +.Pq Event DBH , Umask 01H +Counts unfusion events due to floating point exception to a fused uop. +.It Li BR_INST_DECODED +.Pq Event E0H , Umask 01H +Counts the number of branch instructions decoded. +.It Li BPU_MISSED_CALL_RET +.Pq Event E5H , Umask 01H +Counts number of times the Branch Prediction Unit missed predicting a call +or return branch. +.It Li BACLEAR.CLEAR +.Pq Event E6H , Umask 01H +Counts the number of times the front end is resteered, mainly when the +Branch Prediction Unit cannot provide a correct prediction and this is +corrected by the Branch Address Calculator at the front end. This can occur +if the code has many branches such that they cannot be consumed by the BPU. 
+Each BACLEAR asserted by the BAC generates approximately an 8 cycle bubble +in the instruction fetch pipeline. The effect on total execution time +depends on the surrounding code. +.It Li BACLEAR.BAD_TARGET +.Pq Event E6H , Umask 02H +Counts number of Branch Address Calculator clears (BACLEAR) asserted due to +conditional branch instructions in which there was a target hit but the +direction was wrong. Each BACLEAR asserted by the BAC generates +approximately an 8 cycle bubble in the instruction fetch pipeline. +.It Li BPU_CLEARS.EARLY +.Pq Event E8H , Umask 01H +Counts early (normal) Branch Prediction Unit clears: BPU predicted a taken +branch after incorrectly assuming that it was not taken. +The BPU clear leads to a 2 cycle bubble in the Front End. +.It Li BPU_CLEARS.LATE +.Pq Event E8H , Umask 02H +Counts late Branch Prediction Unit clears due to Most Recently Used +conflicts. The BPU clear leads to a 3 cycle bubble in the Front End. +.It Li THREAD_ACTIVE +.Pq Event ECH , Umask 01H +Counts cycles threads are active. +.It Li L2_TRANSACTIONS.LOAD +.Pq Event F0H , Umask 01H +Counts L2 load operations due to HW prefetch or demand loads. +.It Li L2_TRANSACTIONS.RFO +.Pq Event F0H , Umask 02H +Counts L2 RFO operations due to HW prefetch or demand RFOs. +.It Li L2_TRANSACTIONS.IFETCH +.Pq Event F0H , Umask 04H +Counts L2 instruction fetch operations due to HW prefetch or demand ifetch. +.It Li L2_TRANSACTIONS.PREFETCH +.Pq Event F0H , Umask 08H +Counts L2 prefetch operations. +.It Li L2_TRANSACTIONS.L1D_WB +.Pq Event F0H , Umask 10H +Counts L1D writeback operations to the L2. +.It Li L2_TRANSACTIONS.FILL +.Pq Event F0H , Umask 20H +Counts L2 cache line fill operations due to load, RFO, L1D writeback or +prefetch. +.It Li L2_TRANSACTIONS.WB +.Pq Event F0H , Umask 40H +Counts L2 writeback operations to the L3. +.It Li L2_TRANSACTIONS.ANY +.Pq Event F0H , Umask 80H +Counts all L2 cache operations. 
+.It Li L2_LINES_IN.S_STATE +.Pq Event F1H , Umask 02H +Counts the number of cache lines allocated in the L2 cache in the S (shared) +state. +.It Li L2_LINES_IN.E_STATE +.Pq Event F1H , Umask 04H +Counts the number of cache lines allocated in the L2 cache in the E +(exclusive) state. +.It Li L2_LINES_IN.ANY +.Pq Event F1H , Umask 07H +Counts the number of cache lines allocated in the L2 cache. +.It Li L2_LINES_OUT.DEMAND_CLEAN +.Pq Event F2H , Umask 01H +Counts L2 clean cache lines evicted by a demand request. +.It Li L2_LINES_OUT.DEMAND_DIRTY +.Pq Event F2H , Umask 02H +Counts L2 dirty (modified) cache lines evicted by a demand request. +.It Li L2_LINES_OUT.PREFETCH_CLEAN +.Pq Event F2H , Umask 04H +Counts L2 clean cache line evicted by a prefetch request. +.It Li L2_LINES_OUT.PREFETCH_DIRTY +.Pq Event F2H , Umask 08H +Counts L2 modified cache line evicted by a prefetch request. +.It Li L2_LINES_OUT.ANY +.Pq Event F2H , Umask 0FH +Counts all L2 cache lines evicted for any reason. +.It Li SQ_MISC.LRU_HINTS +.Pq Event F4H , Umask 04H +Counts number of Super Queue LRU hints sent to L3. +.It Li SQ_MISC.SPLIT_LOCK +.Pq Event F4H , Umask 10H +Counts the number of SQ lock splits across a cache line. +.It Li SQ_FULL_STALL_CYCLES +.Pq Event F6H , Umask 01H +Counts cycles the Super Queue is full. Neither of the threads on this core +will be able to access the uncore. +.It Li FP_ASSIST.ALL +.Pq Event F7H , Umask 01H +Counts the number of floating point operations executed that required +micro-code assist intervention. Assists are required in the following cases: +SSE instructions, (Denormal input when the DAZ flag is off or Underflow +result when the FTZ flag is off): x87 instructions, (NaN or denormal are +loaded to a register or used as input from memory, Division by 0 or +Underflow output). +.It Li FP_ASSIST.OUTPUT +.Pq Event F7H , Umask 02H +Counts number of floating point micro-code assist when the output value +(destination register) is invalid. 
+.It Li FP_ASSIST.INPUT +.Pq Event F7H , Umask 04H +Counts number of floating point micro-code assist when the input value (one +of the source operands to an FP instruction) is invalid. +.It Li SIMD_INT_64.PACKED_MPY +.Pq Event FDH , Umask 01H +Counts number of SIMD integer 64 bit packed multiply operations. +.It Li SIMD_INT_64.PACKED_SHIFT +.Pq Event FDH , Umask 02H +Counts number of SIMD integer 64 bit packed shift operations. +.It Li SIMD_INT_64.PACK +.Pq Event FDH , Umask 04H +Counts number of SIMD integer 64 bit pack operations. +.It Li SIMD_INT_64.UNPACK +.Pq Event FDH , Umask 08H +Counts number of SIMD integer 64 bit unpack operations. +.It Li SIMD_INT_64.PACKED_LOGICAL +.Pq Event FDH , Umask 10H +Counts number of SIMD integer 64 bit logical operations. +.It Li SIMD_INT_64.PACKED_ARITH +.Pq Event FDH , Umask 20H +Counts number of SIMD integer 64 bit arithmetic operations. +.It Li SIMD_INT_64.SHUFFLE_MOVE +.Pq Event FDH , Umask 40H +Counts number of SIMD integer 64 bit shift or move operations. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.ucf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7 3 , +.Xr pmc.corei7uc 3 , +.Xr pmc.westmereuc 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.westmereuc.3 b/lib/libpmc/pmc.westmereuc.3 new file mode 100644 index 0000000..c768daa --- /dev/null +++ b/lib/libpmc/pmc.westmereuc.3 @@ -0,0 +1,1083 @@ +.\" Copyright (c) 2010 Fabien Thomas. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. 
Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd March 24, 2010 +.Dt PMC.WESTMEREUC 3 +.Os +.Sh NAME +.Nm pmc.westmereuc +.Nd uncore measurement events for +.Tn Intel +.Tn Westmere +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel +.Tn "Westmere" +CPUs contain PMCs conforming to version 2 of the +.Tn Intel +performance measurement architecture. +These CPUs contain two classes of PMCs: +.Bl -tag -width "Li PMC_CLASS_UCP" +.It Li PMC_CLASS_UCF +Fixed-function counters that count only one hardware event per counter. +.It Li PMC_CLASS_UCP +Programmable counters that may be configured to count one of a defined +set of hardware events. +.El +.Pp +The number of PMCs available in each class and their widths need to be +determined at run time by calling +.Xr pmc_cpuinfo 3 . 
+.Pp +Intel Westmere PMCs are documented in +.Rs +.%B "Intel(R) 64 and IA-32 Architectures Software Developer's Manual" +.%T "Volume 3B: System Programming Guide, Part 2" +.%N "Order Number: 253669-033US" +.%D December 2009 +.%Q "Intel Corporation" +.Re +.Ss WESTMERE UNCORE FIXED FUNCTION PMCS +These PMCs and their supported events are documented in +.Xr pmc.ucf 3 . +Not all CPUs in this family implement fixed-function counters. +.Ss WESTMERE UNCORE PROGRAMMABLE PMCS +The programmable PMCs support the following capabilities: +.Bl -column "PMC_CAP_INTERRUPT" "Support" +.It Em Capability Ta Em Support +.It PMC_CAP_CASCADE Ta \&No +.It PMC_CAP_EDGE Ta Yes +.It PMC_CAP_INTERRUPT Ta \&No +.It PMC_CAP_INVERT Ta Yes +.It PMC_CAP_READ Ta Yes +.It PMC_CAP_PRECISE Ta \&No +.It PMC_CAP_SYSTEM Ta \&No +.It PMC_CAP_TAGGING Ta \&No +.It PMC_CAP_THRESHOLD Ta Yes +.It PMC_CAP_USER Ta \&No +.It PMC_CAP_WRITE Ta Yes +.El +.Ss Event Qualifiers +Event specifiers for these PMCs support the following common +qualifiers: +.Bl -tag -width indent +.It Li cmask= Ns Ar value +Configure the PMC to increment only if the number of configured +events measured in a cycle is greater than or equal to +.Ar value . +.It Li edge +Configure the PMC to count the number of de-asserted to asserted +transitions of the conditions expressed by the other qualifiers. +If specified, the counter will increment only once whenever a +condition becomes true, irrespective of the number of clocks during +which the condition remains true. +.It Li inv +Invert the sense of comparison when the +.Dq Li cmask +qualifier is present, making the counter increment when the number of +events per cycle is less than the value specified by the +.Dq Li cmask +qualifier. +.El +.Ss Event Specifiers (Programmable PMCs) +Westmere uncore programmable PMCs support the following events: +.Bl -tag -width indent +.It Li GQ_CYCLES_FULL.READ_TRACKER +.Pq Event 00H , Umask 01H +Uncore cycles Global Queue read tracker is full. 
+.It Li GQ_CYCLES_FULL.WRITE_TRACKER +.Pq Event 00H , Umask 02H +Uncore cycles Global Queue write tracker is full. +.It Li GQ_CYCLES_FULL.PEER_PROBE_TRACKER +.Pq Event 00H , Umask 04H +Uncore cycles Global Queue peer probe tracker is full. The peer probe +tracker queue tracks snoops from the IOH and remote sockets. +.It Li GQ_CYCLES_NOT_EMPTY.READ_TRACKER +.Pq Event 01H , Umask 01H +Uncore cycles where Global Queue read tracker has at least one valid entry. +.It Li GQ_CYCLES_NOT_EMPTY.WRITE_TRACKER +.Pq Event 01H , Umask 02H +Uncore cycles where Global Queue write tracker has at least one valid entry. +.It Li GQ_CYCLES_NOT_EMPTY.PEER_PROBE_TRACKER +.Pq Event 01H , Umask 04H +Uncore cycles where Global Queue peer probe tracker has at least one valid +entry. The peer probe tracker queue tracks IOH and remote socket snoops. +.It Li GQ_OCCUPANCY.READ_TRACKER +.Pq Event 02H , Umask 01H +Increments the number of queue entries (code read, data read, and RFOs) in +the read tracker. The GQ read tracker allocate to deallocate occupancy +count is divided by the count to obtain the average read tracker latency. +.It Li GQ_ALLOC.READ_TRACKER +.Pq Event 03H , Umask 01H +Counts the number of read tracker allocate to deallocate entries. The GQ +read tracker allocate to deallocate occupancy count is divided by the count +to obtain the average read tracker latency. +.It Li GQ_ALLOC.RT_L3_MISS +.Pq Event 03H , Umask 02H +Counts the number of GQ read tracker entries for which a full cache line read +has missed the L3. The GQ read tracker L3 miss to fill occupancy count is +divided by this count to obtain the average cache line read L3 miss latency. +The latency represents the time after which the L3 has determined that the +cache line has missed. The time between a GQ read tracker allocation and the +L3 determining that the cache line has missed is the average L3 hit latency. +The total L3 cache line read miss latency is the hit latency + L3 miss +latency. 
+.It Li GQ_ALLOC.RT_TO_L3_RESP +.Pq Event 03H , Umask 04H +Counts the number of GQ read tracker entries that are allocated in the read +tracker queue that hit or miss the L3. The GQ read tracker L3 hit occupancy +count is divided by this count to obtain the average L3 hit latency. +.It Li GQ_ALLOC.RT_TO_RTID_ACQUIRED +.Pq Event 03H , Umask 08H +Counts the number of GQ read tracker entries that are allocated in the read +tracker, have missed in the L3 and have not acquired a Request Transaction +ID. The GQ read tracker L3 miss to RTID acquired occupancy count is +divided by this count to obtain the average latency for a read L3 miss to +acquire an RTID. +.It Li GQ_ALLOC.WT_TO_RTID_ACQUIRED +.Pq Event 03H , Umask 10H +Counts the number of GQ write tracker entries that are allocated in the +write tracker, have missed in the L3 and have not acquired a Request +Transaction ID. The GQ write tracker L3 miss to RTID occupancy count is +divided by this count to obtain the average latency for a write L3 miss to +acquire an RTID. +.It Li GQ_ALLOC.WRITE_TRACKER +.Pq Event 03H , Umask 20H +Counts the number of GQ write tracker entries that are allocated in the +write tracker queue that miss the L3. The GQ write tracker occupancy count +is divided by this count to obtain the average L3 write miss latency. +.It Li GQ_ALLOC.PEER_PROBE_TRACKER +.Pq Event 03H , Umask 40H +Counts the number of GQ peer probe tracker (snoop) entries that are +allocated in the peer probe tracker queue that miss the L3. The GQ peer +probe occupancy count is divided by this count to obtain the average L3 peer +probe miss latency. +.It Li GQ_DATA.FROM_QPI +.Pq Event 04H , Umask 01H +Cycles Global Queue Quickpath Interface input data port is busy importing +data from the Quickpath Interface. Each cycle the input port can transfer 8 +or 16 bytes of data. 
+.It Li GQ_DATA.FROM_QMC +.Pq Event 04H , Umask 02H +Cycles Global Queue Quickpath Memory Interface input data port is busy +importing data from the Quickpath Memory Interface. Each cycle the input +port can transfer 8 or 16 bytes of data. +.It Li GQ_DATA.FROM_L3 +.Pq Event 04H , Umask 04H +Cycles GQ L3 input data port is busy importing data from the Last Level +Cache. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.FROM_CORES_02 +.Pq Event 04H , Umask 08H +Cycles GQ Core 0 and 2 input data port is busy importing data from processor +cores 0 and 2. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.FROM_CORES_13 +.Pq Event 04H , Umask 10H +Cycles GQ Core 1 and 3 input data port is busy importing data from processor +cores 1 and 3. Each cycle the input port can transfer 32 bytes of data. +.It Li GQ_DATA.TO_QPI_QMC +.Pq Event 05H , Umask 01H +Cycles GQ QPI and QMC output data port is busy sending data to the Quickpath +Interface or Quickpath Memory Interface. Each cycle the output port can +transfer 32 bytes of data. +.It Li GQ_DATA.TO_L3 +.Pq Event 05H , Umask 02H +Cycles GQ L3 output data port is busy sending data to the Last Level Cache. +Each cycle the output port can transfer 32 bytes of data. +.It Li GQ_DATA.TO_CORES +.Pq Event 05H , Umask 04H +Cycles GQ Core output data port is busy sending data to the Cores. Each +cycle the output port can transfer 32 bytes of data. +.It Li SNP_RESP_TO_LOCAL_HOME.I_STATE +.Pq Event 06H , Umask 01H +Number of snoop responses to the local home that L3 does not have the +referenced cache line. +.It Li SNP_RESP_TO_LOCAL_HOME.S_STATE +.Pq Event 06H , Umask 02H +Number of snoop responses to the local home that L3 has the referenced line +cached in the S state. +.It Li SNP_RESP_TO_LOCAL_HOME.FWD_S_STATE +.Pq Event 06H , Umask 04H +Number of responses to code or data read snoops to the local home that the +L3 has the referenced cache line in the E state. 
The L3 cache line state is +changed to the S state and the line is forwarded to the local home in the S +state. +.It Li SNP_RESP_TO_LOCAL_HOME.FWD_I_STATE +.Pq Event 06H , Umask 08H +Number of responses to read invalidate snoops to the local home that the L3 +has the referenced cache line in the M state. The L3 cache line state is +invalidated and the line is forwarded to the local home in the M state. +.It Li SNP_RESP_TO_LOCAL_HOME.CONFLICT +.Pq Event 06H , Umask 10H +Number of conflict snoop responses sent to the local home. +.It Li SNP_RESP_TO_LOCAL_HOME.WB +.Pq Event 06H , Umask 20H +Number of responses to code or data read snoops to the local home that the +L3 has the referenced line cached in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.I_STATE +.Pq Event 07H , Umask 01H +Number of snoop responses to a remote home that L3 does not have the +referenced cache line. +.It Li SNP_RESP_TO_REMOTE_HOME.S_STATE +.Pq Event 07H , Umask 02H +Number of snoop responses to a remote home that L3 has the referenced line +cached in the S state. +.It Li SNP_RESP_TO_REMOTE_HOME.FWD_S_STATE +.Pq Event 07H , Umask 04H +Number of responses to code or data read snoops to a remote home that the L3 +has the referenced cache line in the E state. The L3 cache line state is +changed to the S state and the line is forwarded to the remote home in the S +state. +.It Li SNP_RESP_TO_REMOTE_HOME.FWD_I_STATE +.Pq Event 07H , Umask 08H +Number of responses to read invalidate snoops to a remote home that the L3 +has the referenced cache line in the M state. The L3 cache line state is +invalidated and the line is forwarded to the remote home in the M state. +.It Li SNP_RESP_TO_REMOTE_HOME.CONFLICT +.Pq Event 07H , Umask 10H +Number of conflict snoop responses sent to the local home. +.It Li SNP_RESP_TO_REMOTE_HOME.WB +.Pq Event 07H , Umask 20H +Number of responses to code or data read snoops to a remote home that the L3 +has the referenced line cached in the M state. 
+.It Li SNP_RESP_TO_REMOTE_HOME.HITM +.Pq Event 07H , Umask 24H +Number of HITM snoop responses to a remote home. +.It Li L3_HITS.READ +.Pq Event 08H , Umask 01H +Number of code read, data read and RFO requests that hit in the L3. +.It Li L3_HITS.WRITE +.Pq Event 08H , Umask 02H +Number of writeback requests that hit in the L3. Writebacks from the cores +will always result in L3 hits due to the inclusive property of the L3. +.It Li L3_HITS.PROBE +.Pq Event 08H , Umask 04H +Number of snoops from IOH or remote sockets that hit in the L3. +.It Li L3_HITS.ANY +.Pq Event 08H , Umask 03H +Number of reads and writes that hit the L3. +.It Li L3_MISS.READ +.Pq Event 09H , Umask 01H +Number of code read, data read and RFO requests that miss the L3. +.It Li L3_MISS.WRITE +.Pq Event 09H , Umask 02H +Number of writeback requests that miss the L3. Should always be zero as +writebacks from the cores will always result in L3 hits due to the inclusive +property of the L3. +.It Li L3_MISS.PROBE +.Pq Event 09H , Umask 04H +Number of snoops from IOH or remote sockets that miss the L3. +.It Li L3_MISS.ANY +.Pq Event 09H , Umask 03H +Number of reads and writes that miss the L3. +.It Li L3_LINES_IN.M_STATE +.Pq Event 0AH , Umask 01H +Counts the number of L3 lines allocated in M state. The only time a cache +line is allocated in the M state is when the line was forwarded in M state +due to a Snoop Read Invalidate Own request. +.It Li L3_LINES_IN.E_STATE +.Pq Event 0AH , Umask 02H +Counts the number of L3 lines allocated in E state. +.It Li L3_LINES_IN.S_STATE +.Pq Event 0AH , Umask 04H +Counts the number of L3 lines allocated in S state. +.It Li L3_LINES_IN.F_STATE +.Pq Event 0AH , Umask 08H +Counts the number of L3 lines allocated in F state. +.It Li L3_LINES_IN.ANY +.Pq Event 0AH , Umask 0FH +Counts the number of L3 lines allocated in any state. +.It Li L3_LINES_OUT.M_STATE +.Pq Event 0BH , Umask 01H +Counts the number of L3 lines victimized that were in the M state. 
When the +victim cache line is in M state, the line is written to its home cache agent +which can be either local or remote. +.It Li L3_LINES_OUT.E_STATE +.Pq Event 0BH , Umask 02H +Counts the number of L3 lines victimized that were in the E state. +.It Li L3_LINES_OUT.S_STATE +.Pq Event 0BH , Umask 04H +Counts the number of L3 lines victimized that were in the S state. +.It Li L3_LINES_OUT.I_STATE +.Pq Event 0BH , Umask 08H +Counts the number of L3 lines victimized that were in the I state. +.It Li L3_LINES_OUT.F_STATE +.Pq Event 0BH , Umask 10H +Counts the number of L3 lines victimized that were in the F state. +.It Li L3_LINES_OUT.ANY +.Pq Event 0BH , Umask 1FH +Counts the number of L3 lines victimized in any state. +.It Li GQ_SNOOP.GOTO_S +.Pq Event 0CH , Umask 01H +Counts the number of remote snoops that have requested a cache line be set +to the S state. +.It Li GQ_SNOOP.GOTO_I +.Pq Event 0CH , Umask 02H +Counts the number of remote snoops that have requested a cache line be set +to the I state. +.It Li GQ_SNOOP.GOTO_S_HIT_E +.Pq Event 0CH , Umask 04H +Counts the number of remote snoops that have requested a cache line be set +to the S state from E state. +Requires writing MSR 301H with mask = 2H +.It Li GQ_SNOOP.GOTO_S_HIT_F +.Pq Event 0CH , Umask 04H +Counts the number of remote snoops that have requested a cache line be set +to the S state from F (forward) state. +Requires writing MSR 301H with mask = 8H +.It Li GQ_SNOOP.GOTO_S_HIT_M +.Pq Event 0CH , Umask 04H +Counts the number of remote snoops that have requested a cache line be set +to the S state from M state. +Requires writing MSR 301H with mask = 1H +.It Li GQ_SNOOP.GOTO_S_HIT_S +.Pq Event 0CH , Umask 04H +Counts the number of remote snoops that have requested a cache line be set +to the S state from S state. 
+Requires writing MSR 301H with mask = 4H +.It Li GQ_SNOOP.GOTO_I_HIT_E +.Pq Event 0CH , Umask 08H +Counts the number of remote snoops that have requested a cache line be set +to the I state from E state. +Requires writing MSR 301H with mask = 2H +.It Li GQ_SNOOP.GOTO_I_HIT_F +.Pq Event 0CH , Umask 08H +Counts the number of remote snoops that have requested a cache line be set +to the I state from F (forward) state. +Requires writing MSR 301H with mask = 8H +.It Li GQ_SNOOP.GOTO_I_HIT_M +.Pq Event 0CH , Umask 08H +Counts the number of remote snoops that have requested a cache line be set +to the I state from M state. +Requires writing MSR 301H with mask = 1H +.It Li GQ_SNOOP.GOTO_I_HIT_S +.Pq Event 0CH , Umask 08H +Counts the number of remote snoops that have requested a cache line be set +to the I state from S state. +Requires writing MSR 301H with mask = 4H +.It Li QHL_REQUESTS.IOH_READS +.Pq Event 20H , Umask 01H +Counts number of Quickpath Home Logic read requests from the IOH. +.It Li QHL_REQUESTS.IOH_WRITES +.Pq Event 20H , Umask 02H +Counts number of Quickpath Home Logic write requests from the IOH. +.It Li QHL_REQUESTS.REMOTE_READS +.Pq Event 20H , Umask 04H +Counts number of Quickpath Home Logic read requests from a remote socket. +.It Li QHL_REQUESTS.REMOTE_WRITES +.Pq Event 20H , Umask 08H +Counts number of Quickpath Home Logic write requests from a remote socket. +.It Li QHL_REQUESTS.LOCAL_READS +.Pq Event 20H , Umask 10H +Counts number of Quickpath Home Logic read requests from the local socket. +.It Li QHL_REQUESTS.LOCAL_WRITES +.Pq Event 20H , Umask 20H +Counts number of Quickpath Home Logic write requests from the local socket. +.It Li QHL_CYCLES_FULL.IOH +.Pq Event 21H , Umask 01H +Counts uclk cycles all entries in the Quickpath Home Logic IOH are full. +.It Li QHL_CYCLES_FULL.REMOTE +.Pq Event 21H , Umask 02H +Counts uclk cycles all entries in the Quickpath Home Logic remote tracker +are full. 
+.It Li QHL_CYCLES_FULL.LOCAL +.Pq Event 21H , Umask 04H +Counts uclk cycles all entries in the Quickpath Home Logic local tracker are +full. +.It Li QHL_CYCLES_NOT_EMPTY.IOH +.Pq Event 22H , Umask 01H +Counts uclk cycles all entries in the Quickpath Home Logic IOH is busy. +.It Li QHL_CYCLES_NOT_EMPTY.REMOTE +.Pq Event 22H , Umask 02H +Counts uclk cycles all entries in the Quickpath Home Logic remote tracker is +busy. +.It Li QHL_CYCLES_NOT_EMPTY.LOCAL +.Pq Event 22H , Umask 04H +Counts uclk cycles all entries in the Quickpath Home Logic local tracker is +busy. +.It Li QHL_OCCUPANCY.IOH +.Pq Event 23H , Umask 01H +QHL IOH tracker allocate to deallocate read occupancy. +.It Li QHL_OCCUPANCY.REMOTE +.Pq Event 23H , Umask 02H +QHL remote tracker allocate to deallocate read occupancy. +.It Li QHL_OCCUPANCY.LOCAL +.Pq Event 23H , Umask 04H +QHL local tracker allocate to deallocate read occupancy. +.It Li QHL_ADDRESS_CONFLICTS.2WAY +.Pq Event 24H , Umask 02H +Counts number of QHL Active Address Table (AAT) entries that saw a max of 2 +conflicts. The AAT is a structure that tracks requests that are in conflict. +The requests themselves are in the home tracker entries. The count is +reported when an AAT entry deallocates. +.It Li QHL_ADDRESS_CONFLICTS.3WAY +.Pq Event 24H , Umask 04H +Counts number of QHL Active Address Table (AAT) entries that saw a max of 3 +conflicts. The AAT is a structure that tracks requests that are in conflict. +The requests themselves are in the home tracker entries. The count is +reported when an AAT entry deallocates. +.It Li QHL_CONFLICT_CYCLES.IOH +.Pq Event 25H , Umask 01H +Counts cycles the Quickpath Home Logic IOH Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_CONFLICT_CYCLES.REMOTE +.Pq Event 25H , Umask 02H +Counts cycles the Quickpath Home Logic Remote Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. 
+.It Li QHL_CONFLICT_CYCLES.LOCAL +.Pq Event 25H , Umask 04H +Counts cycles the Quickpath Home Logic Local Tracker contains two or more +requests with an address conflict. A max of 3 requests can be in conflict. +.It Li QHL_TO_QMC_BYPASS +.Pq Event 26H , Umask 01H +Counts number of requests to the Quickpath Memory Controller that bypass the +Quickpath Home Logic. All local accesses can be bypassed. For remote +requests, only read requests can be bypassed. +.It Li QMC_ISOC_FULL.READ.CH0 +.Pq Event 28H , Umask 01H +Counts cycles all the entries in the DRAM channel 0 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.READ.CH1 +.Pq Event 28H , Umask 02H +Counts cycles all the entries in the DRAM channel 1 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.READ.CH2 +.Pq Event 28H , Umask 04H +Counts cycles all the entries in the DRAM channel 2 high priority queue are +occupied with isochronous read requests. +.It Li QMC_ISOC_FULL.WRITE.CH0 +.Pq Event 28H , Umask 08H +Counts cycles all the entries in the DRAM channel 0 high priority queue are +occupied with isochronous write requests. +.It Li QMC_ISOC_FULL.WRITE.CH1 +.Pq Event 28H , Umask 10H +Counts cycles all the entries in the DRAM channel 1 high priority queue are +occupied with isochronous write requests. +.It Li QMC_ISOC_FULL.WRITE.CH2 +.Pq Event 28H , Umask 20H +Counts cycles all the entries in the DRAM channel 2 high priority queue are +occupied with isochronous write requests. +.It Li QMC_BUSY.READ.CH0 +.Pq Event 29H , Umask 01H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 0. +.It Li QMC_BUSY.READ.CH1 +.Pq Event 29H , Umask 02H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 1. 
+.It Li QMC_BUSY.READ.CH2 +.Pq Event 29H , Umask 04H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +read request to DRAM channel 2. +.It Li QMC_BUSY.WRITE.CH0 +.Pq Event 29H , Umask 08H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 0. +.It Li QMC_BUSY.WRITE.CH1 +.Pq Event 29H , Umask 10H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 1. +.It Li QMC_BUSY.WRITE.CH2 +.Pq Event 29H , Umask 20H +Counts cycles where Quickpath Memory Controller has at least 1 outstanding +write request to DRAM channel 2. +.It Li QMC_OCCUPANCY.CH0 +.Pq Event 2AH , Umask 01H +IMC channel 0 normal read request occupancy. +.It Li QMC_OCCUPANCY.CH1 +.Pq Event 2AH , Umask 02H +IMC channel 1 normal read request occupancy. +.It Li QMC_OCCUPANCY.CH2 +.Pq Event 2AH , Umask 04H +IMC channel 2 normal read request occupancy. +.It Li QMC_OCCUPANCY.ANY +.Pq Event 2AH , Umask 07H +Normal read request occupancy for any channel. +.It Li QMC_ISSOC_OCCUPANCY.CH0 +.Pq Event 2BH , Umask 01H +IMC channel 0 issoc read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH1 +.Pq Event 2BH , Umask 02H +IMC channel 1 issoc read request occupancy. +.It Li QMC_ISSOC_OCCUPANCY.CH2 +.Pq Event 2BH , Umask 04H +IMC channel 2 issoc read request occupancy. +.It Li QMC_ISSOC_READS.ANY +.Pq Event 2BH , Umask 07H +IMC issoc read request occupancy. +.It Li QMC_NORMAL_READS.CH0 +.Pq Event 2CH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 medium and low +priority read requests. The QMC channel 0 normal read occupancy divided by +this count provides the average QMC channel 0 read latency. +.It Li QMC_NORMAL_READS.CH1 +.Pq Event 2CH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 medium and low +priority read requests. The QMC channel 1 normal read occupancy divided by +this count provides the average QMC channel 1 read latency. 
+.It Li QMC_NORMAL_READS.CH2 +.Pq Event 2CH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 medium and low +priority read requests. The QMC channel 2 normal read occupancy divided by +this count provides the average QMC channel 2 read latency. +.It Li QMC_NORMAL_READS.ANY +.Pq Event 2CH , Umask 07H +Counts the number of Quickpath Memory Controller medium and low priority +read requests. The QMC normal read occupancy divided by this count provides +the average QMC read latency. +.It Li QMC_HIGH_PRIORITY_READS.CH0 +.Pq Event 2DH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.CH1 +.Pq Event 2DH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.CH2 +.Pq Event 2DH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 high priority +isochronous read requests. +.It Li QMC_HIGH_PRIORITY_READS.ANY +.Pq Event 2DH , Umask 07H +Counts the number of Quickpath Memory Controller high priority isochronous +read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH0 +.Pq Event 2EH , Umask 01H +Counts the number of Quickpath Memory Controller channel 0 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH1 +.Pq Event 2EH , Umask 02H +Counts the number of Quickpath Memory Controller channel 1 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.CH2 +.Pq Event 2EH , Umask 04H +Counts the number of Quickpath Memory Controller channel 2 critical priority +isochronous read requests. +.It Li QMC_CRITICAL_PRIORITY_READS.ANY +.Pq Event 2EH , Umask 07H +Counts the number of Quickpath Memory Controller critical priority +isochronous read requests. +.It Li QMC_WRITES.FULL.CH0 +.Pq Event 2FH , Umask 01H +Counts number of full cache line writes to DRAM channel 0. 
+.It Li QMC_WRITES.FULL.CH1 +.Pq Event 2FH , Umask 02H +Counts number of full cache line writes to DRAM channel 1. +.It Li QMC_WRITES.FULL.CH2 +.Pq Event 2FH , Umask 04H +Counts number of full cache line writes to DRAM channel 2. +.It Li QMC_WRITES.FULL.ANY +.Pq Event 2FH , Umask 07H +Counts number of full cache line writes to DRAM. +.It Li QMC_WRITES.PARTIAL.CH0 +.Pq Event 2FH , Umask 08H +Counts number of partial cache line writes to DRAM channel 0. +.It Li QMC_WRITES.PARTIAL.CH1 +.Pq Event 2FH , Umask 10H +Counts number of partial cache line writes to DRAM channel 1. +.It Li QMC_WRITES.PARTIAL.CH2 +.Pq Event 2FH , Umask 20H +Counts number of partial cache line writes to DRAM channel 2. +.It Li QMC_WRITES.PARTIAL.ANY +.Pq Event 2FH , Umask 38H +Counts number of partial cache line writes to DRAM. +.It Li QMC_CANCEL.CH0 +.Pq Event 30H , Umask 01H +Counts number of DRAM channel 0 cancel requests. +.It Li QMC_CANCEL.CH1 +.Pq Event 30H , Umask 02H +Counts number of DRAM channel 1 cancel requests. +.It Li QMC_CANCEL.CH2 +.Pq Event 30H , Umask 04H +Counts number of DRAM channel 2 cancel requests. +.It Li QMC_CANCEL.ANY +.Pq Event 30H , Umask 07H +Counts number of DRAM cancel requests. +.It Li QMC_PRIORITY_UPDATES.CH0 +.Pq Event 31H , Umask 01H +Counts number of DRAM channel 0 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.CH1 +.Pq Event 31H , Umask 02H +Counts number of DRAM channel 1 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. 
+.It Li QMC_PRIORITY_UPDATES.CH2 +.Pq Event 31H , Umask 04H +Counts number of DRAM channel 2 priority updates. A priority update occurs +when an ISOC high or critical request is received by the QHL and there is a +matching request with normal priority that has already been issued to the +QMC. In this instance, the QHL will send a priority update to QMC to +expedite the request. +.It Li QMC_PRIORITY_UPDATES.ANY +.Pq Event 31H , Umask 07H +Counts number of DRAM priority updates. A priority update occurs when an +ISOC high or critical request is received by the QHL and there is a matching +request with normal priority that has already been issued to the QMC. In +this instance, the QHL will send a priority update to QMC to expedite the +request. +.It Li IMC_RETRY.CH0 +.Pq Event 32H , Umask 01H +Counts number of IMC DRAM channel 0 retries. DRAM retry only occurs when +configured in RAS mode. +.It Li IMC_RETRY.CH1 +.Pq Event 32H , Umask 02H +Counts number of IMC DRAM channel 1 retries. DRAM retry only occurs when +configured in RAS mode. +.It Li IMC_RETRY.CH2 +.Pq Event 32H , Umask 04H +Counts number of IMC DRAM channel 2 retries. DRAM retry only occurs when +configured in RAS mode. +.It Li IMC_RETRY.ANY +.Pq Event 32H , Umask 07H +Counts number of IMC DRAM retries from any channel. DRAM retry only occurs +when configured in RAS mode. +.It Li QHL_FRC_ACK_CNFLTS.IOH +.Pq Event 33H , Umask 01H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic to the IOH. +.It Li QHL_FRC_ACK_CNFLTS.REMOTE +.Pq Event 33H , Umask 02H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic to the remote home. +.It Li QHL_FRC_ACK_CNFLTS.LOCAL +.Pq Event 33H , Umask 04H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic to the local home. +.It Li QHL_FRC_ACK_CNFLTS.ANY +.Pq Event 33H , Umask 07H +Counts number of Force Acknowledge Conflict messages sent by the Quickpath +Home Logic. 
+.It Li QHL_SLEEPS.IOH_ORDER +.Pq Event 34H , Umask 01H +Counts number of occurrences a request was put to sleep due to IOH ordering +(write after read) conflicts. While in the sleep state, the request is not +eligible to be scheduled to the QMC. +.It Li QHL_SLEEPS.REMOTE_ORDER +.Pq Event 34H , Umask 02H +Counts number of occurrences a request was put to sleep due to remote socket +ordering (write after read) conflicts. While in the sleep state, the request +is not eligible to be scheduled to the QMC. +.It Li QHL_SLEEPS.LOCAL_ORDER +.Pq Event 34H , Umask 04H +Counts number of occurrences a request was put to sleep due to local socket +ordering (write after read) conflicts. While in the sleep state, the request +is not eligible to be scheduled to the QMC. +.It Li QHL_SLEEPS.IOH_CONFLICT +.Pq Event 34H , Umask 08H +Counts number of occurrences a request was put to sleep due to IOH address +conflicts. While in the sleep state, the request is not eligible to be +scheduled to the QMC. +.It Li QHL_SLEEPS.REMOTE_CONFLICT +.Pq Event 34H , Umask 10H +Counts number of occurrences a request was put to sleep due to remote socket +address conflicts. While in the sleep state, the request is not eligible to +be scheduled to the QMC. +.It Li QHL_SLEEPS.LOCAL_CONFLICT +.Pq Event 34H , Umask 20H +Counts number of occurrences a request was put to sleep due to local socket +address conflicts. While in the sleep state, the request is not eligible to +be scheduled to the QMC. +.It Li ADDR_OPCODE_MATCH.IOH +.Pq Event 35H , Umask 01H +Counts number of requests from the IOH, address/opcode of request is +qualified by mask value written to MSR 396H. 
The following mask values are +supported: +0: NONE 40000000_00000000H:RSPFWDI 40001A00_00000000H:RSPFWDS +40001D00_00000000H:RSPIWB +Match opcode/address by writing MSR 396H with a supported mask value. +.It Li ADDR_OPCODE_MATCH.REMOTE +.Pq Event 35H , Umask 02H +Counts number of requests from the remote socket, address/opcode of request +is qualified by mask value written to MSR 396H. The following mask values +are supported: +0: NONE 40000000_00000000H:RSPFWDI 40001A00_00000000H:RSPFWDS +40001D00_00000000H:RSPIWB +Match opcode/address by writing MSR 396H with a supported mask value. +.It Li ADDR_OPCODE_MATCH.LOCAL +.Pq Event 35H , Umask 04H +Counts number of requests from the local socket, address/opcode of request +is qualified by mask value written to MSR 396H. The following mask values +are supported: +0: NONE 40000000_00000000H:RSPFWDI 40001A00_00000000H:RSPFWDS +40001D00_00000000H:RSPIWB +Match opcode/address by writing MSR 396H with a supported mask value. +.It Li QPI_TX_STALLED_SINGLE_FLIT.HOME.LINK_0 +.Pq Event 40H , Umask 01H +Counts cycles the Quickpath outbound link 0 HOME virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.SNOOP.LINK_0 +.Pq Event 40H , Umask 02H +Counts cycles the Quickpath outbound link 0 SNOOP virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.NDR.LINK_0 +.Pq Event 40H , Umask 04H +Counts cycles the Quickpath outbound link 0 non-data response virtual +channel is stalled due to lack of a VNA and VN0 credit. 
Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.HOME.LINK_1 +.Pq Event 40H , Umask 08H +Counts cycles the Quickpath outbound link 1 HOME virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.SNOOP.LINK_1 +.Pq Event 40H , Umask 10H +Counts cycles the Quickpath outbound link 1 SNOOP virtual channel is stalled +due to lack of a VNA and VN0 credit. Note that this event does not filter +out when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.NDR.LINK_1 +.Pq Event 40H , Umask 20H +Counts cycles the Quickpath outbound link 1 non-data response virtual +channel is stalled due to lack of a VNA and VN0 credit. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.LINK_0 +.Pq Event 40H , Umask 07H +Counts cycles the Quickpath outbound link 0 virtual channels are stalled due +to lack of a VNA and VN0 credit. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_SINGLE_FLIT.LINK_1 +.Pq Event 40H , Umask 38H +Counts cycles the Quickpath outbound link 1 virtual channels are stalled due +to lack of a VNA and VN0 credit. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. 
+.It Li QPI_TX_STALLED_MULTI_FLIT.DRS.LINK_0 +.Pq Event 41H , Umask 01H +Counts cycles the Quickpath outbound link 0 Data ResponSe virtual channel is +stalled due to lack of VNA and VN0 credits. Note that this event does not +filter out when a flit would not have been selected for arbitration because +another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCB.LINK_0 +.Pq Event 41H , Umask 02H +Counts cycles the Quickpath outbound link 0 Non-Coherent Bypass virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCS.LINK_0 +.Pq Event 41H , Umask 04H +Counts cycles the Quickpath outbound link 0 Non-Coherent Standard virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.DRS.LINK_1 +.Pq Event 41H , Umask 08H +Counts cycles the Quickpath outbound link 1 Data ResponSe virtual channel is +stalled due to lack of VNA and VN0 credits. Note that this event does not +filter out when a flit would not have been selected for arbitration because +another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCB.LINK_1 +.Pq Event 41H , Umask 10H +Counts cycles the Quickpath outbound link 1 Non-Coherent Bypass virtual +channel is stalled due to lack of VNA and VN0 credits. Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.NCS.LINK_1 +.Pq Event 41H , Umask 20H +Counts cycles the Quickpath outbound link 1 Non-Coherent Standard virtual +channel is stalled due to lack of VNA and VN0 credits. 
Note that this event +does not filter out when a flit would not have been selected for arbitration +because another virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.LINK_0 +.Pq Event 41H , Umask 07H +Counts cycles the Quickpath outbound link 0 virtual channels are stalled due +to lack of VNA and VN0 credits. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_STALLED_MULTI_FLIT.LINK_1 +.Pq Event 41H , Umask 38H +Counts cycles the Quickpath outbound link 1 virtual channels are stalled due +to lack of VNA and VN0 credits. Note that this event does not filter out +when a flit would not have been selected for arbitration because another +virtual channel is getting arbitrated. +.It Li QPI_TX_HEADER.FULL.LINK_0 +.Pq Event 42H , Umask 01H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 0 is full. +.It Li QPI_TX_HEADER.BUSY.LINK_0 +.Pq Event 42H , Umask 02H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 0 is busy. +.It Li QPI_TX_HEADER.FULL.LINK_1 +.Pq Event 42H , Umask 04H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 1 is full. +.It Li QPI_TX_HEADER.BUSY.LINK_1 +.Pq Event 42H , Umask 08H +Number of cycles that the header buffer in the Quickpath Interface outbound +link 1 is busy. +.It Li QPI_RX_NO_PPT_CREDIT.STALLS.LINK_0 +.Pq Event 43H , Umask 01H +Number of cycles that snoop packets incoming to the Quickpath Interface link +0 are stalled and not sent to the GQ because the GQ Peer Probe Tracker (PPT) +does not have any available entries. +.It Li QPI_RX_NO_PPT_CREDIT.STALLS.LINK_1 +.Pq Event 43H , Umask 02H +Number of cycles that snoop packets incoming to the Quickpath Interface link +1 are stalled and not sent to the GQ because the GQ Peer Probe Tracker (PPT) +does not have any available entries. 
+.It Li DRAM_OPEN.CH0 +.Pq Event 60H , Umask 01H +Counts number of DRAM Channel 0 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_OPEN.CH1 +.Pq Event 60H , Umask 02H +Counts number of DRAM Channel 1 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_OPEN.CH2 +.Pq Event 60H , Umask 04H +Counts number of DRAM Channel 2 open commands issued either for read or +write. To read or write data, the referenced DRAM page must first be opened. +.It Li DRAM_PAGE_CLOSE.CH0 +.Pq Event 61H , Umask 01H +DRAM channel 0 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_CLOSE.CH1 +.Pq Event 61H , Umask 02H +DRAM channel 1 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_CLOSE.CH2 +.Pq Event 61H , Umask 04H +DRAM channel 2 command issued to CLOSE a page due to page idle timer +expiration. Closing a page is done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH0 +.Pq Event 62H , Umask 01H +Counts the number of precharges (PRE) that were issued to DRAM channel 0 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_PAGE_MISS.CH1 +.Pq Event 62H , Umask 02H +Counts the number of precharges (PRE) that were issued to DRAM channel 1 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. 
+.It Li DRAM_PAGE_MISS.CH2 +.Pq Event 62H , Umask 04H +Counts the number of precharges (PRE) that were issued to DRAM channel 2 +because there was a page miss. A page miss refers to a situation in which a +page is currently open and another page from the same bank needs to be +opened. The new page experiences a page miss. Closing of the old page is +done by issuing a precharge. +.It Li DRAM_READ_CAS.CH0 +.Pq Event 63H , Umask 01H +Counts the number of times a read CAS command was issued on DRAM channel 0. +.It Li DRAM_READ_CAS.AUTOPRE_CH0 +.Pq Event 63H , Umask 02H +Counts the number of times a read CAS command was issued on DRAM channel 0 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_READ_CAS.CH1 +.Pq Event 63H , Umask 04H +Counts the number of times a read CAS command was issued on DRAM channel 1. +.It Li DRAM_READ_CAS.AUTOPRE_CH1 +.Pq Event 63H , Umask 08H +Counts the number of times a read CAS command was issued on DRAM channel 1 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_READ_CAS.CH2 +.Pq Event 63H , Umask 10H +Counts the number of times a read CAS command was issued on DRAM channel 2. +.It Li DRAM_READ_CAS.AUTOPRE_CH2 +.Pq Event 63H , Umask 20H +Counts the number of times a read CAS command was issued on DRAM channel 2 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH0 +.Pq Event 64H , Umask 01H +Counts the number of times a write CAS command was issued on DRAM channel 0. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH0 +.Pq Event 64H , Umask 02H +Counts the number of times a write CAS command was issued on DRAM channel 0 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH1 +.Pq Event 64H , Umask 04H +Counts the number of times a write CAS command was issued on DRAM channel 1. 
+.It Li DRAM_WRITE_CAS.AUTOPRE_CH1 +.Pq Event 64H , Umask 08H +Counts the number of times a write CAS command was issued on DRAM channel 1 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_WRITE_CAS.CH2 +.Pq Event 64H , Umask 10H +Counts the number of times a write CAS command was issued on DRAM channel 2. +.It Li DRAM_WRITE_CAS.AUTOPRE_CH2 +.Pq Event 64H , Umask 20H +Counts the number of times a write CAS command was issued on DRAM channel 2 +where the command issued used the auto-precharge (auto page close) mode. +.It Li DRAM_REFRESH.CH0 +.Pq Event 65H , Umask 01H +Counts number of DRAM channel 0 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_REFRESH.CH1 +.Pq Event 65H , Umask 02H +Counts number of DRAM channel 1 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_REFRESH.CH2 +.Pq Event 65H , Umask 04H +Counts number of DRAM channel 2 refresh commands. DRAM loses data content +over time. In order to keep correct data content, the data values have to be +refreshed periodically. +.It Li DRAM_PRE_ALL.CH0 +.Pq Event 66H , Umask 01H +Counts number of DRAM Channel 0 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_PRE_ALL.CH1 +.Pq Event 66H , Umask 02H +Counts number of DRAM Channel 1 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. +.It Li DRAM_PRE_ALL.CH2 +.Pq Event 66H , Umask 04H +Counts number of DRAM Channel 2 precharge-all (PREALL) commands that close +all open pages in a rank. PREALL is issued when the DRAM needs to be +refreshed or needs to go into a power down mode. 
+.It Li DRAM_THERMAL_THROTTLED +.Pq Event 67H , Umask 01H +Uncore cycles DRAM was throttled due to its temperature being above the +thermal throttling threshold. +.It Li THERMAL_THROTTLING_TEMP.CORE_0 +.Pq Event 80H , Umask 01H +Cycles that the PCU records that core 0 is above the thermal throttling +threshold temperature. +.It Li THERMAL_THROTTLING_TEMP.CORE_1 +.Pq Event 80H , Umask 02H +Cycles that the PCU records that core 1 is above the thermal throttling +threshold temperature. +.It Li THERMAL_THROTTLING_TEMP.CORE_2 +.Pq Event 80H , Umask 04H +Cycles that the PCU records that core 2 is above the thermal throttling +threshold temperature. +.It Li THERMAL_THROTTLING_TEMP.CORE_3 +.Pq Event 80H , Umask 08H +Cycles that the PCU records that core 3 is above the thermal throttling +threshold temperature. +.It Li THERMAL_THROTTLED_TEMP.CORE_0 +.Pq Event 81H , Umask 01H +Cycles that the PCU records that core 0 is in the power throttled state due +to the core's temperature being above the thermal throttling threshold. +.It Li THERMAL_THROTTLED_TEMP.CORE_1 +.Pq Event 81H , Umask 02H +Cycles that the PCU records that core 1 is in the power throttled state due +to the core's temperature being above the thermal throttling threshold. +.It Li THERMAL_THROTTLED_TEMP.CORE_2 +.Pq Event 81H , Umask 04H +Cycles that the PCU records that core 2 is in the power throttled state due +to the core's temperature being above the thermal throttling threshold. +.It Li THERMAL_THROTTLED_TEMP.CORE_3 +.Pq Event 81H , Umask 08H +Cycles that the PCU records that core 3 is in the power throttled state due +to the core's temperature being above the thermal throttling threshold. +.It Li PROCHOT_ASSERTION +.Pq Event 82H , Umask 01H +Number of system assertions of PROCHOT indicating the entire processor has +exceeded the thermal limit. 
+.It Li THERMAL_THROTTLING_PROCHOT.CORE_0 +.Pq Event 83H , Umask 01H +Cycles that the PCU records that core 0 is in a low power state due to the +system asserting PROCHOT, indicating the entire processor has exceeded the thermal +limit. +.It Li THERMAL_THROTTLING_PROCHOT.CORE_1 +.Pq Event 83H , Umask 02H +Cycles that the PCU records that core 1 is in a low power state due to the +system asserting PROCHOT, indicating the entire processor has exceeded the thermal +limit. +.It Li THERMAL_THROTTLING_PROCHOT.CORE_2 +.Pq Event 83H , Umask 04H +Cycles that the PCU records that core 2 is in a low power state due to the +system asserting PROCHOT, indicating the entire processor has exceeded the thermal +limit. +.It Li THERMAL_THROTTLING_PROCHOT.CORE_3 +.Pq Event 83H , Umask 08H +Cycles that the PCU records that core 3 is in a low power state due to the +system asserting PROCHOT, indicating the entire processor has exceeded the thermal +limit. +.It Li TURBO_MODE.CORE_0 +.Pq Event 84H , Umask 01H +Uncore cycles that core 0 is operating in turbo mode. +.It Li TURBO_MODE.CORE_1 +.Pq Event 84H , Umask 02H +Uncore cycles that core 1 is operating in turbo mode. +.It Li TURBO_MODE.CORE_2 +.Pq Event 84H , Umask 04H +Uncore cycles that core 2 is operating in turbo mode. +.It Li TURBO_MODE.CORE_3 +.Pq Event 84H , Umask 08H +Uncore cycles that core 3 is operating in turbo mode. +.It Li CYCLES_UNHALTED_L3_FLL_ENABLE +.Pq Event 85H , Umask 02H +Uncore cycles that at least one core is unhalted and all L3 ways are +enabled. +.It Li CYCLES_UNHALTED_L3_FLL_DISABLE +.Pq Event 86H , Umask 01H +Uncore cycles that at least one core is unhalted and all L3 ways are +disabled. 
+.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc.atom 3 , +.Xr pmc.core 3 , +.Xr pmc.iaf 3 , +.Xr pmc.ucf 3 , +.Xr pmc.k7 3 , +.Xr pmc.k8 3 , +.Xr pmc.p4 3 , +.Xr pmc.p5 3 , +.Xr pmc.p6 3 , +.Xr pmc.corei7 3 , +.Xr pmc.corei7uc 3 , +.Xr pmc.westmere 3 , +.Xr pmc.tsc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . 
+.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . diff --git a/lib/libpmc/pmc.xscale.3 b/lib/libpmc/pmc.xscale.3 new file mode 100644 index 0000000..ba4b6d1 --- /dev/null +++ b/lib/libpmc/pmc.xscale.3 @@ -0,0 +1,156 @@ +.\" Copyright (c) 2009, 2010 Rui Paulo. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Rui Paulo ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd December 23, 2009 +.Os +.Dt PMC.XSCALE 3 +.Sh NAME +.Nm pmc.xscale +.Nd measurement events for +.Tn Intel +.Tn XScale +family CPUs +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Sh DESCRIPTION +.Tn Intel XScale +CPUs are ARM CPUs based on the ARMv5e core. 
+.Pp +Second generation cores have 2 counters, while third generation cores +have 4 counters. +Third generation cores also have an increased number of PMC events. +.Pp +.Tn Intel XScale +PMCs are documented in +.Rs +.%B "3rd Generation Intel XScale Microarchitecture Developer's Manual" +.%D May 2007 +.Re +.Ss Event Specifiers (Programmable PMCs) +.Tn Intel XScale +programmable PMCs support the following events: +.Bl -tag -width indent +.It Li IC_FETCH +External memory fetch due to L1 instruction cache miss. +.It Li IC_MISS +Instruction cache or TLB miss. +.It Li DATA_DEPENDENCY_STALLED +A data dependency stalled the instruction pipeline. +.It Li ITLB_MISS +Instruction TLB miss. +.It Li DTLB_MISS +Data TLB miss. +.It Li BRANCH_RETIRED +Branch instruction retired (executed). +.It Li BRANCH_MISPRED +Branch mispredicted. +.It Li INSTR_RETIRED +Instructions retired (executed). +.It Li DC_FULL_CYCLE +L1 data cache buffer full stall. +Event occurs on every cycle the +condition is present. +.It Li DC_FULL_CONTIG +L1 data cache buffer full stall. +Event occurs once for each contiguous sequence of this type of stall. +.It Li DC_ACCESS +L1 data cache access, not including cache operations. +.It Li DC_MISS +L1 data cache miss, not including cache operations. +.It Li DC_WRITEBACK +L1 data cache write-back. +Occurs for each cache line that's written back from the cache. +.It Li PC_CHANGE +Software changed the program counter. +.It Li BRANCH_RETIRED_ALL +Branch instruction retired (executed). +This event counts all branch instructions, indirect or direct. +.It Li INSTR_CYCLE +Count the number of microarchitecture cycles each instruction requires +to issue. +.It Li CP_STALL +Coprocessor stalled the instruction pipeline. +.It Li PC_CHANGE_ALL +Software changed the program counter (includes exceptions). +.It Li PIPELINE_FLUSH +Pipeline flushes due to mispredictions or exceptions. +.It Li BACKEND_STALL +Backend stalled the instruction pipeline. +.It Li MULTIPLIER_USE +Multiplier used. 
+.It Li MULTIPLIER_STALLED +Multiplier stalled the instruction pipeline. +.It Li DATA_CACHE_STALLED +Data cache stalled the instruction pipeline. +.It Li L2_CACHE_REQ +L2 cache request, not including cache operations. +.It Li L2_CACHE_MISS +L2 cache miss, not including cache operations. +.It Li ADDRESS_BUS_TRANS +Address bus transaction. +.It Li SELF_ADDRESS_BUS_TRANS +Self initiated address bus transaction. +.It Li DATA_BUS_TRANS +Data bus transaction. +.El +.Ss Event Name Aliases +The following table shows the mapping between the PMC-independent +aliases supported by +.Lb libpmc +and the underlying hardware events used. +.Bl -column "branch-mispredicts" "BRANCH_MISPRED" +.It Em Alias Ta Em Event Ta +.It Li branches Ta Li BRANCH_RETIRED Ta +.It Li branch-mispredicts Ta Li BRANCH_MISPRED Ta +.It Li dc-misses Ta Li DC_MISS Ta +.It Li ic-misses Ta Li IC_MISS Ta +.It Li instructions Ta Li INSTR_RETIRED Ta +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmclog 3 , +.Xr hwpmc 4 +.Sh HISTORY +The +.Nm pmc +library first appeared in +.Fx 6.0 . +Intel XScale support first appeared in +.Fx 9.0 . +.Sh AUTHORS +The +.Lb libpmc +library was written by +.An "Joseph Koshy" +.Aq jkoshy@FreeBSD.org . +.Pp +Intel XScale support was added by +.An "Rui Paulo" +.Aq rpaulo@FreeBSD.org . +.Sh CAVEATS +The Intel XScale code does not yet support sampling. diff --git a/lib/libpmc/pmc_allocate.3 b/lib/libpmc/pmc_allocate.3 new file mode 100644 index 0000000..6a2a6c0 --- /dev/null +++ b/lib/libpmc/pmc_allocate.3 @@ -0,0 +1,184 @@ +.\" Copyright (c) 2007-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd September 22, 2008 +.Dt PMC_ALLOCATE 3 +.Os +.Sh NAME +.Nm pmc_allocate , +.Nm pmc_release +.Nd allocate and free performance monitoring counters +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fo pmc_allocate +.Fa "const char *eventspecifier" +.Fa "enum pmc_mode mode" +.Fa "uint32_t flags" +.Fa "int cpu" +.Fa "pmc_id_t *pmcid" +.Fc +.Ft int +.Fn pmc_release "pmc_id_t pmcid" +.Sh DESCRIPTION +Function +.Fn pmc_allocate +allocates a performance monitoring counter that measures the events +named by argument +.Fa eventspecifier , +and writes the allocated handle to the location pointed to by argument +.Fa pmcid . +.Pp +Argument +.Fa eventspecifier +comprises a PMC event name followed by an optional comma-separated +list of keywords and qualifiers. +The allowed syntax for argument +.Fa eventspecifier +is processor specific and is listed in section +.Sx "EVENT SPECIFIERS" +in the +.Xr pmc 3 +manual page. +.Pp +The desired PMC mode is specified by argument +.Fa mode . 
+Legal values for the +.Fa mode +argument are: +.Bl -tag -width ".Dv PMC_MODE_SS" -compact +.It Dv PMC_MODE_SC +Allocate a system-scope counting PMC. +.It Dv PMC_MODE_SS +Allocate a system-scope sampling PMC. +.It Dv PMC_MODE_TC +Allocate a process-scope counting PMC. +.It Dv PMC_MODE_TS +Allocate a process-scope sampling PMC. +.El +.Pp +Mode specific modifiers may be specified using argument +.Fa flags . +The flags supported at PMC allocation time are: +.Bl -tag -width ".Dv PMC_F_LOG_PROCEXIT" -compact +.It Dv PMC_F_DESCENDANTS +For process-scope PMCs, automatically track descendants of attached +processes. +.It Dv PMC_F_LOG_PROCCSW +For process-scope counting PMCs, generate a log event at every context +switch containing the incremental number of hardware events seen +by the process during the time it was executing on the CPU. +.It Dv PMC_F_LOG_PROCEXIT +For process-scope counting PMCs, accumulate hardware events seen +when the process was executing on a CPU and generate a log event +when an attached process exits. +.El +PMCs allocated with flags +.Dv PMC_F_LOG_PROCCSW +and +.Dv PMC_F_LOG_PROCEXIT +need a log file to be configured before they are started. +.Pp +For system scope PMCs, the argument +.Fa cpu +is a non-negative value that specifies the CPU number +that the PMC is to be allocated on. +Process scope PMC allocations should specify the constant +.Dv PMC_CPU_ANY +for this argument. +.Pp +Function +.Fn pmc_release +releases the PMC denoted by argument +.Fa pmcid . +.Sh RETURN VALUES +If successful, function +.Fn pmc_allocate +sets the location specified by argument +.Fa pmcid +to the handle of the allocated PMC and returns 0. +In case of an error, the function returns -1 and sets the global +variable +.Va errno +to indicate the error. +.Pp +.Rv -std pmc_release +.Sh ERRORS +.Bl -tag -width Er +.It Bq Er EINVAL +The argument +.Fa mode +to function +.Fn pmc_allocate +had an invalid value. 
+.It Bq Er EINVAL +Argument +.Fa cpu +to function +.Fn pmc_allocate +had an invalid CPU number. +.It Bq Er EINVAL +Argument +.Fa flags +contained flags that were unsupported or otherwise incompatible with +the requested PMC mode. +.It Bq Er EINVAL +Argument +.Fa eventspecifier +to function +.Fn pmc_allocate +specified an event not supported by hardware or contained a syntax +error. +.It Bq Er ENXIO +Function +.Fn pmc_allocate +requested the use of a hardware resource that was absent or +administratively disabled. +.It Bq Er EOPNOTSUPP +The underlying hardware does not support the capabilities needed for +a PMC being allocated by a call to +.Fn pmc_allocate . +.It Bq Er EPERM +A system scope PMC allocation was attempted without adequate process +privilege. +.It Bq Er ESRCH +Function +.Fn pmc_release +was called without first having allocated a PMC. +.It Bq Er EINVAL +Argument +.Fa pmcid +to function +.Fn pmc_release +did not specify a PMC previously allocated by this process. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc_attach 3 , +.Xr pmc_configure_logfile 3 , +.Xr pmc_start 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_attach.3 b/lib/libpmc/pmc_attach.3 new file mode 100644 index 0000000..ca72511 --- /dev/null +++ b/lib/libpmc/pmc_attach.3 @@ -0,0 +1,149 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. 
+.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 25, 2007 +.Dt PMC_ATTACH 3 +.Os +.Sh NAME +.Nm pmc_attach , +.Nm pmc_detach +.Nd attaching and detaching process scope PMCs to target processes +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_attach "pmc_id_t pmcid" "pid_t pid" +.Ft int +.Fn pmc_detach "pmc_id_t pmcid" "pid_t pid" +.Sh DESCRIPTION +These functions control the set of target processes tracked by a +process scope PMC. +.Pp +Function +.Fn pmc_attach +is used to attach a process scope PMC specified by argument +.Fa pmcid +to a target process specified by argument +.Fa pid . +Argument +.Fa pid +may be zero to denote the current process. +If the PMC was allocated with modifier +.Dv PMC_F_DESCENDANTS , +the PMC will additionally attach to current and future descendants of +the specified target process. +The PMC should be in a quiescent state (i.e., not running). +.Pp +Function +.Fn pmc_detach +is used to detach a process scope PMC specified by argument +.Fa pmcid +from a process specified by argument +.Fa pid . +Argument +.Fa pid +may be zero to denote the current process. 
+.Sh RETURN VALUES +.Rv -std +.Sh ERRORS +A call to function +.Fn pmc_attach +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EBUSY +Argument +.Fa pmcid +specified a PMC that was not in a quiescent state. +.It Bq Er EBUSY +The target process specified to function +.Fn pmc_attach +is being tracked by another process scope PMC that uses the same PMC +hardware resources. +.It Bq Er EEXIST +The target process is already being tracked by the specified PMC. +.It Bq Er EINVAL +Argument +.Fa pmcid +specified a PMC with system scope. +.It Bq Er EINVAL +Argument +.Fa pid +specified an illegal process id. +.It Bq Er EINVAL +The current process does not own a PMC with the handle specified in +argument +.Fa pmcid . +.It Bq Er EPERM +The caller lacked the privilege needed to attach PMCs to +the specified target process. +.It Bq Er EPERM +(i386 and amd64 architectures) The PMC specified by argument +.Fa pmcid +has been set up to allow the use of the RDPMC instruction for +self measurement. +.It Bq Er ESRCH +The current process does not own any PMCs. +.It Bq Er ESRCH +The process specified by argument +.Fa pid +did not exist. +.El +.Pp +A call to function +.Fn pmc_detach +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +Argument +.Fa pmcid +specified a PMC with system scope. +.It Bq Er EINVAL +Argument +.Fa pid +specified an illegal process id. +.It Bq Er EINVAL +The current process does not own a PMC with the handle specified in +argument +.Fa pmcid . +.It Bq Er EINVAL +The specified PMC was not attached to the target process. +.It Bq Er ESRCH +The current process does not own any PMCs. +.It Bq Er ESRCH +The process specified by argument +.Fa pid +is not being monitored by +.Xr hwpmc 4 . +.It Bq Er ESRCH +The process specified by argument +.Fa pid +did not exist. 
+.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc_start 3 , +.Xr pmc_stop 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_capabilities.3 b/lib/libpmc/pmc_capabilities.3 new file mode 100644 index 0000000..6aee17f --- /dev/null +++ b/lib/libpmc/pmc_capabilities.3 @@ -0,0 +1,230 @@ +.\" Copyright (c) 2007-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. 
+.\" +.\" $FreeBSD$ +.\" +.Dd September 22, 2008 +.Dt PMC_CAPABILITIES 3 +.Os +.Sh NAME +.Nm pmc_capabilities , +.Nm pmc_cpuinfo , +.Nm pmc_ncpu , +.Nm pmc_npmc , +.Nm pmc_pmcinfo , +.Nm pmc_width +.Nd retrieve information about performance monitoring counters +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_capabilities "pmc_id_t pmc" "uint32_t *caps" +.Ft int +.Fn pmc_cpuinfo "const struct pmc_cpuinfo **cpu_info" +.Ft int +.Fn pmc_ncpu void +.Ft int +.Fn pmc_npmc "int cpu" +.Ft int +.Fn pmc_pmcinfo "int cpu" "struct pmc_pmcinfo **pmc_info" +.Ft int +.Fn pmc_width "pmc_id_t pmc" "uint32_t *width" +.Sh DESCRIPTION +These functions retrieve information about performance monitoring +hardware. +.Pp +Function +.Fn pmc_capabilities +retrieves the hardware capabilities of a PMC. +Argument +.Fa pmc +is a PMC handle obtained by a prior call to +.Fn pmc_allocate . +The function sets argument +.Fa caps +to a bit mask of capabilities supported by the PMC denoted by +argument +.Fa pmc . +PMC capabilities are described in +.Xr pmc 3 . +.Pp +Function +.Fn pmc_cpuinfo +retrieves information about the CPUs in the system. +Argument +.Fa cpu_info +will be set to point to an internal structure with information about +the system's CPUs. +The caller should not free this pointer value. +This structure has the following fields: +.Bl -tag -width "pm_classes" -offset indent -compact +.It pm_cputype +Specifies the CPU type. +.It pm_ncpu +Specifies the number of CPUs in the system. +.It pm_npmc +Specifies the number of PMC rows per CPU. +.It pm_nclass +Specifies the number of distinct classes of PMCs in the system. +.It pm_classes +Contains an array of +.Vt "struct pmc_classinfo" +descriptors describing the properties of each class of PMCs +in the system. +.El +.Pp +Function +.Fn pmc_ncpu +is a convenience function that returns the maximum CPU number in +the system. +On systems that support sparsely numbered CPUs, not all CPUs may +be physically present. 
+Applications need to be prepared to deal with nonexistent CPUs. +.Pp +Function +.Fn pmc_npmc +is a convenience function that returns the number of PMCs available +in the CPU specified by argument +.Fa cpu . +.Pp +Function +.Fn pmc_pmcinfo +returns information about the current state of the PMC hardware +in the CPU specified by argument +.Fa cpu . +The location specified by argument +.Fa pmc_info +is set to point to an array of +.Vt "struct pmc_pmcinfo" +structures, each describing the state of one PMC in the CPU. +These structures contain the following fields: +.Bl -tag -width pm_ownerpid -offset indent -compact +.It pm_name +A human readable name for the PMC. +.It pm_class +The PMC class for the PMC. +.It pm_enabled +Non-zero if the PMC is enabled. +.It pm_rowdisp +The disposition of the PMC row for this PMC. +Row dispositions are documented in +.Xr hwpmc 4 . +.It pm_ownerpid +If the hardware is in use, the process id of the owner of the PMC. +.It pm_mode +The PMC mode as described in +.Xr pmc 3 . +.It pm_event +If the hardware is in use, the PMC event being measured. +.It pm_flags +If the hardware is in use, the flags associated with the PMC. +.It pm_reloadcount +For sampling PMCs, the reload count associated with the PMC. +.El +.Pp +Function +.Fn pmc_width +is used to retrieve the width in bits of the hardware counters +associated with a PMC. +Argument +.Fa pmc +is a PMC handle obtained by a prior call to +.Fn pmc_allocate . +The function sets the location pointed to by argument +.Fa width +to the width of the physical counters associated with PMC +.Fa pmc . +.Sh RETURN VALUES +Functions +.Fn pmc_ncpu +and +.Fn pmc_npmc +return a positive integer if successful; otherwise the value -1 is +returned and the global variable +.Va errno +is set to indicate the error. 
+.Pp +Functions +.Fn pmc_capabilities , +.Fn pmc_cpuinfo , +.Fn pmc_pmcinfo +and +.Fn pmc_width +return 0 if successful; otherwise the value -1 is returned and the +global variable +.Va errno +is set to indicate the error. +.Sh ERRORS +A call to function +.Fn pmc_capabilities +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +The argument to the function was invalid. +.El +.Pp +Calls to functions +.Fn pmc_cpuinfo , +.Fn pmc_ncpu +and +.Fn pmc_npmc +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er ENXIO +A prior call to +.Fn pmc_init +to initialize the PMC library had failed. +.El +.Pp +A call to function +.Fn pmc_pmcinfo +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +The argument +.Fa cpu +was invalid. +.It Bq Er ENXIO +The argument +.Fa cpu +specified a disabled or absent CPU. +.El +.Pp +A call to function +.Fn pmc_width +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +The argument to the function was invalid. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc_allocate 3 , +.Xr pmc_get_driver_stats 3 , +.Xr pmc_name_of_capability 3 , +.Xr pmc_name_of_cputype 3 , +.Xr pmc_name_of_class 3 , +.Xr pmc_name_of_event 3 , +.Xr pmc_name_of_mode 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_configure_logfile.3 b/lib/libpmc/pmc_configure_logfile.3 new file mode 100644 index 0000000..a33688c --- /dev/null +++ b/lib/libpmc/pmc_configure_logfile.3 @@ -0,0 +1,124 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 24, 2007 +.Dt PMC_CONFIGURE_LOGFILE 3 +.Os +.Sh NAME +.Nm pmc_configure_logfile , +.Nm pmc_flush_logfile , +.Nm pmc_writelog +.Nd log file management +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_configure_logfile "int fd" +.Ft int +.Fn pmc_flush_logfile void +.Ft int +.Fn pmc_writelog "uint32_t userdata" +.Sh DESCRIPTION +The functions manage logging of +.Xr hwpmc 4 +events. +.Pp +Function +.Fn pmc_configure_logfile +is used to turn on and turn off logging. +If argument +.Fa fd +is a valid file handle returned by a prior call to +.Xr open 2 +or +.Xr socket 2 +then performance events will be logged to the file corresponding +to the specified handle. +If the value of argument +.Fa fd +is -1 then logging will be stopped after any pending data is flushed. +.Pp +Function +.Fn pmc_flush_logfile +will force all log data queued inside the +.Xr hwpmc 4 +driver to be written out. 
+.Pp +Function +.Fn pmc_writelog +will append a log entry containing the value of argument +.Fa userdata +to the log file. +.Sh RETURN VALUES +.Rv -std +.Sh ERRORS +A call to +.Fn pmc_configure_logfile +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EAGAIN +The +.Xr hwpmc 4 +driver was not able to create a helper process due to system limits +being reached. +.It Bq Er EBUSY +Function +.Fn pmc_configure_logfile +was called with a log file already configured. +.It Bq Er EINVAL +Function +.Fn pmc_configure_logfile +was called with an argument of -1 without a log file being previously +configured. +.It Bq Er ENOMEM +The system encountered a memory shortage when servicing this request. +.El +.Pp +A call to +.Fn pmc_flush_logfile +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +Function +.Fn pmc_flush_logfile +was called without a log file being previously configured. +.El +.Pp +A call to +.Fn pmc_writelog +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +Function +.Fn pmc_writelog +was called without a log file being previously configured. +.It Bq Er ENOMEM +The system encountered a memory shortage when servicing this +request. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_disable.3 b/lib/libpmc/pmc_disable.3 new file mode 100644 index 0000000..a6902ff --- /dev/null +++ b/lib/libpmc/pmc_disable.3 @@ -0,0 +1,99 @@ +.\" Copyright (c) 2007-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. 
Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd September 22, 2008 +.Dt PMC_DISABLE 3 +.Os +.Sh NAME +.Nm pmc_disable , +.Nm pmc_enable +.Nd administrative control of hardware performance counters +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_disable "int cpu" "int pmc" +.Ft int +.Fn pmc_enable "int cpu" "int pmc" +.Sh DESCRIPTION +These functions allow specific hardware performance monitoring +counters in a system to be disabled and enabled administratively. +The hardware performance counters available on each CPU are numbered +using small non-negative integers, in a system dependent manner. +Disabled counters will not be available to applications for use. +.Pp +The invoking process needs to have the +.Dv PRIV_PMC_MANAGE +privilege to perform these operations. +.Pp +Function +.Fn pmc_disable +disables the hardware counter numbered by argument +.Fa pmc +on CPU number +.Fa cpu . +.Pp +Function +.Fn pmc_enable +enables the hardware counter numbered by argument +.Fa pmc +on CPU number +.Fa cpu . 
+.Sh IMPLEMENTATION NOTES +Hardware PMCs that are currently in use by applications cannot be +disabled. +Allocation of a process scope software PMC marks all +hardware PMCs in the system with the same pmc number as being in use. +.Sh RETURN VALUES +.Rv -std +.Sh ERRORS +A call to these functions may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EBUSY +Function +.Fn pmc_disable +specified a hardware PMC that is currently in use. +.It Bq Er EINVAL +Arguments +.Fa cpu +or +.Fa pmc +were invalid. +.It Bq Er ENXIO +Argument +.Fa cpu +specified a disabled or absent CPU. +.It Bq Er EPERM +The current process lacks sufficient privilege to perform this +operation. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmc_pmcinfo 3 , +.Xr hwpmc 4 , +.Xr pmccontrol 8 , +.Xr priv_check 9 diff --git a/lib/libpmc/pmc_event_names_of_class.3 b/lib/libpmc/pmc_event_names_of_class.3 new file mode 100644 index 0000000..183f03f --- /dev/null +++ b/lib/libpmc/pmc_event_names_of_class.3 @@ -0,0 +1,75 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 23, 2007 +.Dt PMC_EVENT_NAMES_OF_CLASS 3 +.Os +.Sh NAME +.Nm pmc_event_names_of_class +.Nd return a list of event names supported by a PMC class +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fo pmc_event_names_of_class +.Fa "enum pmc_class cl" +.Fa "const char ***eventnames" +.Fa "int *nevents" +.Fc +.Sh DESCRIPTION +Function +.Fn pmc_event_names_of_class +retrieves the hardware event names supported by the class of PMC hardware +specified by argument +.Fa cl . +.Pp +The location pointed to by argument +.Fa eventnames +is set to point to an array of +.Vt "const char *" +pointers to names of events supported by the specified class of PMC +hardware. +The location pointed to by argument +.Fa nevents +is set to the number of event names returned. +.Pp +The returned array is allocated using +.Xr malloc 3 . +.Sh RETURN VALUES +.Rv -std pmc_event_names_of_class +.Sh ERRORS +A call to +.Fn pmc_event_names_of_class +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +Argument +.Fa cl +was invalid. +.It Bq Er ENOMEM +Allocation of a memory area to hold the result failed. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_get_driver_stats.3 b/lib/libpmc/pmc_get_driver_stats.3 new file mode 100644 index 0000000..fa214b3 --- /dev/null +++ b/lib/libpmc/pmc_get_driver_stats.3 @@ -0,0 +1,73 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 25, 2007 +.Dt PMC_GET_DRIVER_STATS 3 +.Os +.Sh NAME +.Nm pmc_get_driver_stats +.Nd retrieve driver statistics +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_get_driver_stats "struct pmc_driverstats *gms" +.Sh DESCRIPTION +The function +.Fn pmc_get_driver_stats +retrieves a snapshot of the usage statistics maintained by +.Xr hwpmc 4 +into the memory area pointed to by argument +.Fa gms . +.Pp +The returned structure includes the following fields: +.Bl -tag -width pmc_intr_bufferfull -offset indent -compact +.It pm_intr_ignored +The number of sampling interrupts ignored. +.It pm_intr_processed +The number of sampling interrupts processed. 
+.It pm_intr_bufferfull +The number of sampling interrupts dropped due to lack of space +in the sample buffer. +.It pm_syscalls +The number of system calls into +.Xr hwpmc 4 . +.It pm_syscalls_errors +The number of system calls into +.Xr hwpmc 4 +that failed. +.It pm_buffer_requests +The number of log buffer requests so far. +.It pm_buffer_requests_failed +The number of log buffer requests that failed due to lack of buffers. +.It pm_log_sweeps +The number of sample buffer processing sweeps. +.El +.Sh RETURN VALUES +.Rv -std +.Sh SEE ALSO +.Xr pmc 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_get_msr.3 b/lib/libpmc/pmc_get_msr.3 new file mode 100644 index 0000000..6361d3a --- /dev/null +++ b/lib/libpmc/pmc_get_msr.3 @@ -0,0 +1,76 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 25, 2007 +.Dt PMC_GET_MSR 3 +.Os +.Sh NAME +.Nm pmc_get_msr +.Nd x86 architecture-specific PMC operations +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_get_msr "pmc_id_t pmc" "uint32_t *msr" +.Sh DESCRIPTION +The function +.Fn pmc_get_msr +returns the processor model specific register number associated with +a PMC for subsequent use with RDPMC instructions. +Argument +.Fa pmc +specifies a process scope counting PMC. +The function will write the model specific register number associated +with the PMC to the location pointed to by argument +.Fa msr . +.Pp +After successful completion of this function, applications +can directly read the contents of PMC hardware using +RDPMC instructions. +.Sh RETURN VALUES +.Rv -std pmc_get_msr +.Sh ERRORS +A call to +.Fn pmc_get_msr +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +The PMC handle specified was invalid. +.It Bq Er EINVAL +The PMC specified did not have process scope or counting mode. +.It Bq Er EINVAL +The PMC specified was allocated with the +.Dv PMC_F_DESCENDANTS +flag. +.It Bq Er EINVAL +The specified PMC is already attached to target processes other +than the owner. +.It Bq Er ENOSYS +The underlying hardware does not support an RDPMC instruction. 
+.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_init.3 b/lib/libpmc/pmc_init.3 new file mode 100644 index 0000000..655bfb6 --- /dev/null +++ b/lib/libpmc/pmc_init.3 @@ -0,0 +1,63 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 24, 2007 +.Dt PMC_INIT 3 +.Os +.Sh NAME +.Nm pmc_init +.Nd initialize library +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_init void +.Sh DESCRIPTION +Function +.Fn pmc_init +initializes the PMC library. +This function must be called before any of the other functions in the +library. 
+.Sh RETURN VALUES +.Rv -std pmc_init +.Sh ERRORS +A call to +.Fn pmc_init +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er ENOENT +The +.Xr hwpmc 4 +module was not found in the kernel. +.It Bq Er EPROGMISMATCH +The library's version number did not match that expected by +.Xr hwpmc 4 . +.It Bq Er ENXIO +PMC hardware on this system is unsupported. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_name_of_capability.3 b/lib/libpmc/pmc_name_of_capability.3 new file mode 100644 index 0000000..78efeaf --- /dev/null +++ b/lib/libpmc/pmc_name_of_capability.3 @@ -0,0 +1,140 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. 
+.\" +.\" $FreeBSD$ +.\" +.Dd November 24, 2007 +.Dt PMC_NAME_OF_CAPABILITY 3 +.Os +.Sh NAME +.Nm pmc_name_of_capability , +.Nm pmc_name_of_class , +.Nm pmc_name_of_cputype , +.Nm pmc_name_of_disposition , +.Nm pmc_name_of_event , +.Nm pmc_name_of_mode , +.Nm pmc_name_of_state +.Nd human readable names for numeric constants used by +.Xr pmc 3 +and +.Xr hwpmc 4 +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft "const char *" +.Fn pmc_name_of_capability "enum pmc_caps pc" +.Ft "const char *" +.Fn pmc_name_of_class "enum pmc_class pc" +.Ft "const char *" +.Fn pmc_name_of_cputype "enum pmc_cputype ct" +.Ft "const char *" +.Fn pmc_name_of_disposition "enum pmc_disp pd" +.Ft "const char *" +.Fn pmc_name_of_event "enum pmc_event pe" +.Ft "const char *" +.Fn pmc_name_of_mode "enum pmc_mode pm" +.Ft "const char *" +.Fn pmc_name_of_state "enum pmc_state ps" +.Sh DESCRIPTION +These convenience functions translate numeric constants used by the +.Lb libpmc +to +.Vt "const char *" +pointers to human readable representations of their arguments. +.Pp +Function +.Fn pmc_name_of_capability +translates a PMC capability flag given in argument +.Fa pc +to a human readable string. +PMC capabilities are described in +.Xr pmc 3 . +.Pp +Function +.Fn pmc_name_of_class +translates the PMC class value specified in argument +.Fa pc +to a human readable name. +PMC classes are described in +.Xr pmc 3 . +.Pp +Function +.Fn pmc_name_of_cputype +translates the CPU type value specified in argument +.Fa ct +to a human readable name. +CPU types known to the library are described in +.Xr pmc 3 . +.Pp +Function +.Fn pmc_name_of_disposition +translates the PMC row disposition specified in argument +.Fa pd +to a human readable name. +PMC row dispositions are described in +.Xr hwpmc 4 . +.Pp +Function +.Fn pmc_name_of_event +translates the PMC event number specified by argument +.Fa pe +to a string. +PMC event names are documented in section +.Sx EVENT SPECIFIERS +of +.Xr pmc 3 . 
+.Pp +Function +.Fn pmc_name_of_mode +translates the PMC mode specified by argument +.Fa pm +to a human readable string. +PMC modes are described in +.Xr pmc 3 . +.Pp +Function +.Fn pmc_name_of_state +translates the value of argument +.Fa ps +to a human readable name. +.Sh IMPLEMENTATION NOTES +The returned pointers point to static storage inside the PMC +library and should not be freed by the caller. +.Sh RETURN VALUES +These functions return a non-NULL pointer on successful completion. +In case of an error, a NULL pointer is returned and the global +variable +.Va errno +is set to indicate the error. +.Sh ERRORS +A call to these functions may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +The function argument specified an invalid value. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr pmc_cpuinfo 3 , +.Xr pmc_pmcinfo 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_read.3 b/lib/libpmc/pmc_read.3 new file mode 100644 index 0000000..d091716 --- /dev/null +++ b/lib/libpmc/pmc_read.3 @@ -0,0 +1,84 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 25, 2007 +.Dt PMC_READ 3 +.Os +.Sh NAME +.Nm pmc_read , +.Nm pmc_rw , +.Nm pmc_write +.Nd read and write hardware performance counters +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_read "pmc_id_t pmc" "pmc_value_t *value" +.Ft int +.Fn pmc_rw "pmc_id_t pmc" "pmc_value_t newvalue" "pmc_value_t *oldvaluep" +.Ft int +.Fn pmc_write "pmc_id_t pmc" "pmc_value_t value" +.Sh DESCRIPTION +These functions read and write the current value of a PMC. +.Pp +Function +.Fn pmc_read +will read the current value of the PMC specified by argument +.Fa pmc +and write it to the location specified by argument +.Fa value . +.Pp +Function +.Fn pmc_write +will set the current value of the PMC specified by argument +.Fa pmc +to the value specified by argument +.Fa value . +.Pp +Function +.Fn pmc_rw +combines a read and a write into a single atomic operation. +.Pp +For write operations the PMC should be in a quiescent state. +.Sh RETURN VALUES +.Rv -std +.Sh ERRORS +A call to these functions may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EBUSY +A write operation specified a currently running PMC. +.It Bq Er EINVAL +Argument +.Fa pmc +specified a PMC not in a readable state. +.It Bq Er EINVAL +The PMC specified by argument +.Fa pmc +was not owned by the current process. 
+.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_set.3 b/lib/libpmc/pmc_set.3 new file mode 100644 index 0000000..e8d6597 --- /dev/null +++ b/lib/libpmc/pmc_set.3 @@ -0,0 +1,73 @@ +.\" Copyright (c) 2007 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd November 25, 2007 +.Dt PMC_SET 3 +.Os +.Sh NAME +.Nm pmc_set +.Nd set the reload count of a sampling PMC +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_set "pmc_id_t pmc" "pmc_value_t value" +.Sh DESCRIPTION +Function +.Fn pmc_set +is used to set the reload value of sampling PMCs. +Argument +.Fa pmc +specifies the handle of a previously allocated sampling mode PMC. 
+Argument +.Fa value +specifies the reload count. +.Pp +Sampling PMCs will interrupt the CPU after the number of +hardware events specified by the reload count are seen. +After the sampling interrupt is processed the underlying hardware will +be reloaded with the specified count and the hardware +automatically restarted by +.Xr hwpmc 4 . +.Pp +Function +.Fn pmc_set +should be called on a PMC in a quiescent state. +.Sh RETURN VALUES +.Rv -std pmc_set +.Sh ERRORS +A call to +.Fn pmc_set +may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EINVAL +The current process did not own a PMC with the specified handle. +.It Bq Er EBUSY +The specified PMC was already running. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmc_start.3 b/lib/libpmc/pmc_start.3 new file mode 100644 index 0000000..2272122 --- /dev/null +++ b/lib/libpmc/pmc_start.3 @@ -0,0 +1,77 @@ +.\" Copyright (c) 2007-2008 Joseph Koshy. All rights reserved. +.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. 
in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd September 22, 2008 +.Dt PMC_START 3 +.Os +.Sh NAME +.Nm pmc_start , +.Nm pmc_stop +.Nd start and stop a PMC +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmc.h +.Ft int +.Fn pmc_start "pmc_id_t pmc" +.Ft int +.Fn pmc_stop "pmc_id_t pmc" +.Sh DESCRIPTION +These functions are used to start and stop a PMC. +.Pp +Function +.Fn pmc_start +starts the PMC specified by argument +.Fa pmc . +If the specified PMC has process scope and has not been attached +to any targets, it will be attached to the current process. +.Pp +Function +.Fn pmc_stop +stops the PMC specified by argument +.Fa pmc . +.Sh RETURN VALUES +.Rv -std +.Sh ERRORS +A call to these functions may fail with the following errors: +.Bl -tag -width Er +.It Bq Er EDOOFUS +Function +.Fn pmc_start +specified a PMC that requires a log file and no log file was +configured. +.It Bq Er EINVAL +The specified PMC is in the process of being deleted. +.It Bq Er EINVAL +Function +.Fn pmc_stop +specified a PMC that was never started. +.It Bq Er ENXIO +The specified PMC had system scope and its associated CPU was disabled or +absent. +.El +.Sh SEE ALSO +.Xr pmc 3 , +.Xr hwpmc 4 diff --git a/lib/libpmc/pmclog.3 b/lib/libpmc/pmclog.3 new file mode 100644 index 0000000..4438f10 --- /dev/null +++ b/lib/libpmc/pmclog.3 @@ -0,0 +1,320 @@ +.\" Copyright (c) 2005-2006 Joseph Koshy. All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions and the following disclaimer. +.\" 2. Redistributions in binary form must reproduce the above copyright +.\" notice, this list of conditions and the following disclaimer in the +.\" documentation and/or other materials provided with the distribution. +.\" +.\" This software is provided by Joseph Koshy ``as is'' and +.\" any express or implied warranties, including, but not limited to, the +.\" implied warranties of merchantability and fitness for a particular purpose +.\" are disclaimed. in no event shall Joseph Koshy be liable +.\" for any direct, indirect, incidental, special, exemplary, or consequential +.\" damages (including, but not limited to, procurement of substitute goods +.\" or services; loss of use, data, or profits; or business interruption) +.\" however caused and on any theory of liability, whether in contract, strict +.\" liability, or tort (including negligence or otherwise) arising in any way +.\" out of the use of this software, even if advised of the possibility of +.\" such damage. +.\" +.\" $FreeBSD$ +.\" +.Dd March 26, 2006 +.Dt PMCLOG 3 +.Os +.Sh NAME +.Nm pmclog_open , +.Nm pmclog_close , +.Nm pmclog_read , +.Nm pmclog_feed +.Nd parse event log data generated by +.Xr hwpmc 4 +.Sh LIBRARY +.Lb libpmc +.Sh SYNOPSIS +.In pmclog.h +.Ft "void *" +.Fn pmclog_open "int fd" +.Ft void +.Fn pmclog_close "void *cookie" +.Ft int +.Fn pmclog_read "void *cookie" "struct pmclog_ev *ev" +.Ft int +.Fn pmclog_feed "void *cookie" "char *data" "int len" +.Sh DESCRIPTION +These functions provide a way for application programs to extract +events from an event stream generated by +.Xr hwpmc 4 . +.Pp +A new event log parser is allocated using +.Fn pmclog_open . 
+Argument +.Fa fd +may be a file descriptor opened for reading if the event stream is +present in a file, or the constant +.Dv PMCLOG_FD_NONE +for an event stream present in memory. +This function returns a cookie that is passed into the other functions +in this API set. +.Pp +Function +.Fn pmclog_read +returns the next available event in the event stream associated with +argument +.Fa cookie . +Argument +.Fa ev +points to an event descriptor that will contain the result of a +successfully parsed event. +.Pp +An event descriptor returned by +.Fn pmclog_read +has the following structure: +.Bd -literal +struct pmclog_ev { + enum pmclog_state pl_state; /* parser state after 'get_event()' */ + off_t pl_offset; /* byte offset in stream */ + size_t pl_count; /* count of records so far */ + struct timespec pl_ts; /* log entry timestamp */ + enum pmclog_type pl_type; /* log entry kind */ + union { /* log entry data */ + struct pmclog_ev_closelog pl_cl; + struct pmclog_ev_dropnotify pl_dn; + struct pmclog_ev_initialize pl_i; + struct pmclog_ev_map_in pl_mi; + struct pmclog_ev_map_out pl_mo; + struct pmclog_ev_pcsample pl_s; + struct pmclog_ev_pmcallocate pl_a; + struct pmclog_ev_pmcattach pl_t; + struct pmclog_ev_pmcdetach pl_d; + struct pmclog_ev_proccsw pl_c; + struct pmclog_ev_procexec pl_x; + struct pmclog_ev_procexit pl_e; + struct pmclog_ev_procfork pl_f; + struct pmclog_ev_sysexit pl_se; + struct pmclog_ev_userdata pl_u; + } pl_u; +}; +.Ed +.Pp +The current state of the parser is recorded in +.Va pl_state . +This field can take on the following values: +.Bl -tag -width ".Dv PMCLOG_REQUIRE_DATA" +.It Dv PMCLOG_EOF +(For file based parsers only) +An end-of-file condition was encountered on the configured file +descriptor. +.It Dv PMCLOG_ERROR +An error occurred during parsing. +.It Dv PMCLOG_OK +A complete event record was read into +.Fa *ev . +.It Dv PMCLOG_REQUIRE_DATA +There was insufficient data in the event stream to assemble a complete +event record. 
+For memory based parsers, more data can be fed to the +parser using function +.Fn pmclog_feed . +For file based parsers, function +.Fn pmclog_read +may be retried when data is available on the configured file +descriptor. +.El +.Pp +The rest of the event structure is valid only if field +.Va pl_state +contains +.Dv PMCLOG_OK . +Field +.Va pl_offset +contains the offset of the current record in the byte stream. +Field +.Va pl_count +contains the serial number of this event. +Field +.Va pl_ts +contains a timestamp with the system time when the event occurred. +Field +.Va pl_type +denotes the kind of the event returned in argument +.Fa *ev +and is one of the following: +.Bl -tag -width ".Dv PMCLOG_TYPE_PMCALLOCATE" +.It Dv PMCLOG_TYPE_CLOSELOG +A marker indicating a successful close of a log file. +This record will be the last record of a log file. +.It Dv PMCLOG_TYPE_DROPNOTIFY +A marker indicating that +.Xr hwpmc 4 +had to drop data due to a resource constraint. +.It Dv PMCLOG_TYPE_INITIALIZE +An initialization record. +This is the first record in a log file. +.It Dv PMCLOG_TYPE_MAP_IN +A record describing the introduction of a mapping to an executable +object by a +.Xr kldload 2 +or +.Xr mmap 2 +system call. +.It Dv PMCLOG_TYPE_MAP_OUT +A record describing the removal of a mapping to an executable +object by a +.Xr kldunload 2 +or +.Xr munmap 2 +system call. +.It Dv PMCLOG_TYPE_PCSAMPLE +A record containing an instruction pointer sample. +.It Dv PMCLOG_TYPE_PMCALLOCATE +A record describing a PMC allocation operation. +.It Dv PMCLOG_TYPE_PMCATTACH +A record describing a PMC attach operation. +.It Dv PMCLOG_TYPE_PMCDETACH +A record describing a PMC detach operation. +.It Dv PMCLOG_TYPE_PROCCSW +A record describing a PMC reading at the time of a process context switch. +.It Dv PMCLOG_TYPE_PROCEXEC +A record describing an +.Xr execve 2 +by a target process. 
+.It Dv PMCLOG_TYPE_PROCEXIT +A record describing the accumulated PMC reading for a process at the +time of +.Xr _exit 2 . +.It Dv PMCLOG_TYPE_PROCFORK +A record describing a +.Xr fork 2 +by a target process. +.It Dv PMCLOG_TYPE_SYSEXIT +A record describing a process exit, sent to processes +owning system-wide sampling PMCs. +.It Dv PMCLOG_TYPE_USERDATA +A record containing user data. +.El +.Pp +Function +.Fn pmclog_feed +is used with parsers configured to parse memory based event streams. +It is intended to be called when function +.Fn pmclog_read +indicates the need for more data by returning +.Dv PMCLOG_REQUIRE_DATA +in field +.Va pl_state +of its event structure argument. +Argument +.Fa data +points to the start of a memory buffer containing fresh event data. +Argument +.Fa len +indicates the number of data bytes available. +The memory range +.Bq Fa data , Fa data No + Fa len +must remain valid till the next time +.Fn pmclog_read +returns an error. +It is an error to use +.Fn pmclog_feed +on a parser configured to parse file data. +.Pp +Function +.Fn pmclog_close +releases the internal state allocated by a prior call +to +.Fn pmclog_open . +.Sh RETURN VALUES +Function +.Fn pmclog_open +will return a +.No non- Ns Dv NULL +value if successful or +.Dv NULL +otherwise. +.Pp +Function +.Fn pmclog_read +will return 0 in case a complete event record was successfully read, +or will return \-1 and will set the +.Va pl_state +field of the event record to the appropriate code in case of an error. +.Pp +Function +.Fn pmclog_feed +will return 0 on success or \-1 in case of failure. 
+.Sh EXAMPLES +A template for using the log file parsing API is shown below in pseudocode: +.Bd -literal +void *parser; /* cookie */ +struct pmclog_ev ev; /* parsed event */ +int fd; /* file descriptor */ + +fd = open(filename, O_RDONLY); /* open log file */ +parser = pmclog_open(fd); /* initialize parser */ +if (parser == NULL) + --handle an out of memory error--; + +/* read and parse data */ +while (pmclog_read(parser, &ev) == 0) { + assert(ev.pl_state == PMCLOG_OK); + /* process the event */ + switch (ev.pl_type) { + case PMCLOG_TYPE_PMCALLOCATE: + --process a pmc allocation record-- + break; + case PMCLOG_TYPE_PROCCSW: + --process a thread context switch record-- + break; + case PMCLOG_TYPE_PCSAMPLE: + --process a PC sample-- + break; + --and so on-- + } +} + +/* examine parser state */ +switch (ev.pl_state) { +case PMCLOG_EOF: + --normal termination-- + break; +case PMCLOG_ERROR: + --look at errno here-- + break; +case PMCLOG_REQUIRE_DATA: + --arrange for more data to be available for parsing-- + break; +default: + assert(0); + /*NOTREACHED*/ +} + +pmclog_close(parser); /* cleanup */ +.Ed +.Sh ERRORS +A call to +.Fn pmclog_open +may fail with any of the errors returned by +.Xr malloc 3 . +.Pp +A call to +.Fn pmclog_read +for a file based parser may fail with any of the errors returned by +.Xr read 2 . +.Sh SEE ALSO +.Xr read 2 , +.Xr malloc 3 , +.Xr pmc 3 , +.Xr hwpmc 4 , +.Xr pmcstat 8 +.Sh HISTORY +The +.Nm pmclog +API +.Ud +It first appeared in +.Fx 6.0 . diff --git a/lib/libpmc/pmclog.c b/lib/libpmc/pmclog.c new file mode 100644 index 0000000..d9ebc67 --- /dev/null +++ b/lib/libpmc/pmclog.c @@ -0,0 +1,577 @@ +/*- + * Copyright (c) 2005-2007 Joseph Koshy + * Copyright (c) 2007 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/param.h> +#include <sys/pmc.h> +#include <sys/pmclog.h> + +#include <assert.h> +#include <errno.h> +#include <pmc.h> +#include <pmclog.h> +#include <stddef.h> +#include <stdlib.h> +#include <string.h> +#include <strings.h> +#include <unistd.h> + +#include <machine/pmc_mdep.h> + +#include "libpmcinternal.h" + +#define PMCLOG_BUFFER_SIZE 4096 + +/* + * API NOTES + * + * The pmclog(3) API is oriented towards parsing an event stream in + * "realtime", i.e., from a data source that may or may not preserve + * record boundaries -- for example when the data source is elsewhere + * on a network. 
The API allows data to be fed into the parser zero + * or more bytes at a time. + * + * The state for a log file parser is maintained in a 'struct + * pmclog_parse_state'. Parser invocations are done by calling + * 'pmclog_read()'; this function will inform the caller when a + * complete event is parsed. + * + * The parser first assembles a complete log file event in an internal + * work area (see "ps_saved" below). Once a complete log file event + * is read, the parser then parses it and converts it to an event + * descriptor usable by the client. We could possibly avoid this two + * step process by directly parsing the input log to set fields in the + * event record. However the parser's state machine would get + * insanely complicated, and this code is unlikely to be used in + * performance critical paths. + */ + +enum pmclog_parser_state { + PL_STATE_NEW_RECORD, /* in-between records */ + PL_STATE_EXPECTING_HEADER, /* header being read */ + PL_STATE_PARTIAL_RECORD, /* header present but not the record */ + PL_STATE_ERROR /* parsing error encountered */ +}; + +struct pmclog_parse_state { + enum pmclog_parser_state ps_state; + enum pmc_cputype ps_arch; /* log file architecture */ + uint32_t ps_version; /* hwpmc version */ + int ps_initialized; /* whether initialized */ + int ps_count; /* count of records processed */ + off_t ps_offset; /* stream byte offset */ + union pmclog_entry ps_saved; /* saved partial log entry */ + int ps_svcount; /* #bytes saved */ + int ps_fd; /* active fd or -1 */ + char *ps_buffer; /* scratch buffer if fd != -1 */ + char *ps_data; /* current parse pointer */ + size_t ps_len; /* length of buffered data */ +}; + +#define PMCLOG_HEADER_FROM_SAVED_STATE(PS) \ + (* ((uint32_t *) &(PS)->ps_saved)) + +#define PMCLOG_INITIALIZE_READER(LE,A) LE = (uint32_t *) &(A) +#define PMCLOG_READ32(LE,V) do { \ + (V) = *(LE)++; \ + } while (0) +#define PMCLOG_READ64(LE,V) do { \ + uint64_t _v; \ + _v = (uint64_t) *(LE)++; \ + _v |= ((uint64_t) *(LE)++) << 
32; \ + (V) = _v; \ + } while (0) + +#define PMCLOG_READSTRING(LE,DST,LEN) strlcpy((DST), (char *) (LE), (LEN)) + +/* + * Assemble a log record from '*len' octets starting from address '*data'. + * Update 'data' and 'len' to reflect the number of bytes consumed. + * + * '*data' is potentially an unaligned address and '*len' octets may + * not be enough to complete a event record. + */ + +static enum pmclog_parser_state +pmclog_get_record(struct pmclog_parse_state *ps, char **data, ssize_t *len) +{ + int avail, copylen, recordsize, used; + uint32_t h; + const int HEADERSIZE = sizeof(uint32_t); + char *src, *dst; + + if ((avail = *len) <= 0) + return (ps->ps_state = PL_STATE_ERROR); + + src = *data; + h = used = 0; + + if (ps->ps_state == PL_STATE_NEW_RECORD) + ps->ps_svcount = 0; + + dst = (char *) &ps->ps_saved + ps->ps_svcount; + + switch (ps->ps_state) { + case PL_STATE_NEW_RECORD: + + /* + * Transitions: + * + * Case A: avail < headersize + * -> 'expecting header' + * + * Case B: avail >= headersize + * B.1: avail < recordsize + * -> 'partial record' + * B.2: avail >= recordsize + * -> 'new record' + */ + + copylen = avail < HEADERSIZE ? 
avail : HEADERSIZE; + bcopy(src, dst, copylen); + ps->ps_svcount = used = copylen; + + if (copylen < HEADERSIZE) { + ps->ps_state = PL_STATE_EXPECTING_HEADER; + goto done; + } + + src += copylen; + dst += copylen; + + h = PMCLOG_HEADER_FROM_SAVED_STATE(ps); + recordsize = PMCLOG_HEADER_TO_LENGTH(h); + + if (recordsize <= 0) + goto error; + + if (recordsize <= avail) { /* full record available */ + bcopy(src, dst, recordsize - copylen); + ps->ps_svcount = used = recordsize; + goto done; + } + + /* header + a partial record is available */ + bcopy(src, dst, avail - copylen); + ps->ps_svcount = used = avail; + ps->ps_state = PL_STATE_PARTIAL_RECORD; + + break; + + case PL_STATE_EXPECTING_HEADER: + + /* + * Transitions: + * + * Case C: avail+saved < headersize + * -> 'expecting header' + * + * Case D: avail+saved >= headersize + * D.1: avail+saved < recordsize + * -> 'partial record' + * D.2: avail+saved >= recordsize + * -> 'new record' + * (see PARTIAL_RECORD handling below) + */ + + if (avail + ps->ps_svcount < HEADERSIZE) { + bcopy(src, dst, avail); + ps->ps_svcount += avail; + used = avail; + break; + } + + used = copylen = HEADERSIZE - ps->ps_svcount; + bcopy(src, dst, copylen); + src += copylen; + dst += copylen; + avail -= copylen; + ps->ps_svcount += copylen; + + /*FALLTHROUGH*/ + + case PL_STATE_PARTIAL_RECORD: + + /* + * Transitions: + * + * Case E: avail+saved < recordsize + * -> 'partial record' + * + * Case F: avail+saved >= recordsize + * -> 'new record' + */ + + h = PMCLOG_HEADER_FROM_SAVED_STATE(ps); + recordsize = PMCLOG_HEADER_TO_LENGTH(h); + + if (recordsize <= 0) + goto error; + + if (avail + ps->ps_svcount < recordsize) { + copylen = avail; + ps->ps_state = PL_STATE_PARTIAL_RECORD; + } else { + copylen = recordsize - ps->ps_svcount; + ps->ps_state = PL_STATE_NEW_RECORD; + } + + bcopy(src, dst, copylen); + ps->ps_svcount += copylen; + used += copylen; + break; + + default: + goto error; + } + + done: + *data += used; + *len -= used; + return 
ps->ps_state; + + error: + ps->ps_state = PL_STATE_ERROR; + return ps->ps_state; +} + +/* + * Get an event from the stream pointed to by '*data'. '*len' + * indicates the number of bytes available to parse. Arguments + * '*data' and '*len' are updated to indicate the number of bytes + * consumed. + */ + +static int +pmclog_get_event(void *cookie, char **data, ssize_t *len, + struct pmclog_ev *ev) +{ + int evlen, pathlen; + uint32_t h, *le, npc; + enum pmclog_parser_state e; + struct pmclog_parse_state *ps; + + ps = (struct pmclog_parse_state *) cookie; + + assert(ps->ps_state != PL_STATE_ERROR); + + if ((e = pmclog_get_record(ps,data,len)) == PL_STATE_ERROR) { + ev->pl_state = PMCLOG_ERROR; + return -1; + } + + if (e != PL_STATE_NEW_RECORD) { + ev->pl_state = PMCLOG_REQUIRE_DATA; + return -1; + } + + PMCLOG_INITIALIZE_READER(le, ps->ps_saved); + + PMCLOG_READ32(le,h); + + if (!PMCLOG_HEADER_CHECK_MAGIC(h)) { + ps->ps_state = PL_STATE_ERROR; + ev->pl_state = PMCLOG_ERROR; + return -1; + } + + /* copy out the time stamp */ + PMCLOG_READ32(le,ev->pl_ts.tv_sec); + PMCLOG_READ32(le,ev->pl_ts.tv_nsec); + + evlen = PMCLOG_HEADER_TO_LENGTH(h); + +#define PMCLOG_GET_PATHLEN(P,E,TYPE) do { \ + (P) = (E) - offsetof(struct TYPE, pl_pathname); \ + if ((P) > PATH_MAX || (P) < 0) \ + goto error; \ + } while (0) + +#define PMCLOG_GET_CALLCHAIN_SIZE(SZ,E) do { \ + (SZ) = ((E) - offsetof(struct pmclog_callchain, pl_pc)) \ + / sizeof(uintfptr_t); \ + } while (0); + + switch (ev->pl_type = PMCLOG_HEADER_TO_TYPE(h)) { + case PMCLOG_TYPE_CALLCHAIN: + PMCLOG_READ32(le,ev->pl_u.pl_cc.pl_pid); + PMCLOG_READ32(le,ev->pl_u.pl_cc.pl_pmcid); + PMCLOG_READ32(le,ev->pl_u.pl_cc.pl_cpuflags); + PMCLOG_GET_CALLCHAIN_SIZE(ev->pl_u.pl_cc.pl_npc,evlen); + for (npc = 0; npc < ev->pl_u.pl_cc.pl_npc; npc++) + PMCLOG_READADDR(le,ev->pl_u.pl_cc.pl_pc[npc]); + for (;npc < PMC_CALLCHAIN_DEPTH_MAX; npc++) + ev->pl_u.pl_cc.pl_pc[npc] = (uintfptr_t) 0; + break; + case PMCLOG_TYPE_CLOSELOG: + case 
PMCLOG_TYPE_DROPNOTIFY: + /* nothing to do */ + break; + case PMCLOG_TYPE_INITIALIZE: + PMCLOG_READ32(le,ev->pl_u.pl_i.pl_version); + PMCLOG_READ32(le,ev->pl_u.pl_i.pl_arch); + ps->ps_version = ev->pl_u.pl_i.pl_version; + ps->ps_arch = ev->pl_u.pl_i.pl_arch; + ps->ps_initialized = 1; + break; + case PMCLOG_TYPE_MAP_IN: + PMCLOG_GET_PATHLEN(pathlen,evlen,pmclog_map_in); + PMCLOG_READ32(le,ev->pl_u.pl_mi.pl_pid); + PMCLOG_READADDR(le,ev->pl_u.pl_mi.pl_start); + PMCLOG_READSTRING(le, ev->pl_u.pl_mi.pl_pathname, pathlen); + break; + case PMCLOG_TYPE_MAP_OUT: + PMCLOG_READ32(le,ev->pl_u.pl_mo.pl_pid); + PMCLOG_READADDR(le,ev->pl_u.pl_mo.pl_start); + PMCLOG_READADDR(le,ev->pl_u.pl_mo.pl_end); + break; + case PMCLOG_TYPE_PCSAMPLE: + PMCLOG_READ32(le,ev->pl_u.pl_s.pl_pid); + PMCLOG_READADDR(le,ev->pl_u.pl_s.pl_pc); + PMCLOG_READ32(le,ev->pl_u.pl_s.pl_pmcid); + PMCLOG_READ32(le,ev->pl_u.pl_s.pl_usermode); + break; + case PMCLOG_TYPE_PMCALLOCATE: + PMCLOG_READ32(le,ev->pl_u.pl_a.pl_pmcid); + PMCLOG_READ32(le,ev->pl_u.pl_a.pl_event); + PMCLOG_READ32(le,ev->pl_u.pl_a.pl_flags); + if ((ev->pl_u.pl_a.pl_evname = + _pmc_name_of_event(ev->pl_u.pl_a.pl_event, ps->ps_arch)) + == NULL) + goto error; + break; + case PMCLOG_TYPE_PMCATTACH: + PMCLOG_GET_PATHLEN(pathlen,evlen,pmclog_pmcattach); + PMCLOG_READ32(le,ev->pl_u.pl_t.pl_pmcid); + PMCLOG_READ32(le,ev->pl_u.pl_t.pl_pid); + PMCLOG_READSTRING(le,ev->pl_u.pl_t.pl_pathname,pathlen); + break; + case PMCLOG_TYPE_PMCDETACH: + PMCLOG_READ32(le,ev->pl_u.pl_d.pl_pmcid); + PMCLOG_READ32(le,ev->pl_u.pl_d.pl_pid); + break; + case PMCLOG_TYPE_PROCCSW: + PMCLOG_READ32(le,ev->pl_u.pl_c.pl_pmcid); + PMCLOG_READ64(le,ev->pl_u.pl_c.pl_value); + PMCLOG_READ32(le,ev->pl_u.pl_c.pl_pid); + break; + case PMCLOG_TYPE_PROCEXEC: + PMCLOG_GET_PATHLEN(pathlen,evlen,pmclog_procexec); + PMCLOG_READ32(le,ev->pl_u.pl_x.pl_pid); + PMCLOG_READADDR(le,ev->pl_u.pl_x.pl_entryaddr); + PMCLOG_READ32(le,ev->pl_u.pl_x.pl_pmcid); + 
PMCLOG_READSTRING(le,ev->pl_u.pl_x.pl_pathname,pathlen); + break; + case PMCLOG_TYPE_PROCEXIT: + PMCLOG_READ32(le,ev->pl_u.pl_e.pl_pmcid); + PMCLOG_READ64(le,ev->pl_u.pl_e.pl_value); + PMCLOG_READ32(le,ev->pl_u.pl_e.pl_pid); + break; + case PMCLOG_TYPE_PROCFORK: + PMCLOG_READ32(le,ev->pl_u.pl_f.pl_oldpid); + PMCLOG_READ32(le,ev->pl_u.pl_f.pl_newpid); + break; + case PMCLOG_TYPE_SYSEXIT: + PMCLOG_READ32(le,ev->pl_u.pl_se.pl_pid); + break; + case PMCLOG_TYPE_USERDATA: + PMCLOG_READ32(le,ev->pl_u.pl_u.pl_userdata); + break; + default: /* unknown record type */ + ps->ps_state = PL_STATE_ERROR; + ev->pl_state = PMCLOG_ERROR; + return (-1); + } + + ev->pl_offset = (ps->ps_offset += evlen); + ev->pl_count = (ps->ps_count += 1); + ev->pl_state = PMCLOG_OK; + return 0; + + error: + ev->pl_state = PMCLOG_ERROR; + ps->ps_state = PL_STATE_ERROR; + return -1; +} + +/* + * Extract and return the next event from the byte stream. + * + * Returns 0 and sets the event's state to PMCLOG_OK in case an event + * was successfully parsed. Otherwise this function returns -1 and + * sets the event's state to one of PMCLOG_REQUIRE_DATA (if more data + * is needed) or PMCLOG_EOF (if an EOF was seen) or PMCLOG_ERROR if + * a parse error was encountered. + */ + +int +pmclog_read(void *cookie, struct pmclog_ev *ev) +{ + int retval; + ssize_t nread; + struct pmclog_parse_state *ps; + + ps = (struct pmclog_parse_state *) cookie; + + if (ps->ps_state == PL_STATE_ERROR) { + ev->pl_state = PMCLOG_ERROR; + return -1; + } + + /* + * If there isn't enough data left for a new event try and get + * more data. + */ + if (ps->ps_len == 0) { + ev->pl_state = PMCLOG_REQUIRE_DATA; + + /* + * If we have a valid file descriptor to read from, attempt + * to read from that. This read may return with an error, + * (which may be EAGAIN or other recoverable error), or + * can return EOF. 
+ */ + if (ps->ps_fd != PMCLOG_FD_NONE) { + refill: + nread = read(ps->ps_fd, ps->ps_buffer, + PMCLOG_BUFFER_SIZE); + + if (nread <= 0) { + if (nread == 0) + ev->pl_state = PMCLOG_EOF; + else if (errno != EAGAIN) /* not restartable */ + ev->pl_state = PMCLOG_ERROR; + return -1; + } + + ps->ps_len = nread; + ps->ps_data = ps->ps_buffer; + } else + return -1; + } + + assert(ps->ps_len > 0); + + + /* Retrieve one event from the byte stream. */ + retval = pmclog_get_event(ps, &ps->ps_data, &ps->ps_len, ev); + + /* + * If we need more data and we have a configured fd, try read + * from it. + */ + if (retval < 0 && ev->pl_state == PMCLOG_REQUIRE_DATA && + ps->ps_fd != -1) { + assert(ps->ps_len == 0); + goto refill; + } + + return retval; +} + +/* + * Feed data to a memory based parser. + * + * The memory area pointed to by 'data' needs to be valid till the + * next error return from pmclog_next_event(). + */ + +int +pmclog_feed(void *cookie, char *data, int len) +{ + struct pmclog_parse_state *ps; + + ps = (struct pmclog_parse_state *) cookie; + + if (len < 0 || /* invalid length */ + ps->ps_buffer || /* called for a file parser */ + ps->ps_len != 0) /* unnecessary call */ + return -1; + + ps->ps_data = data; + ps->ps_len = len; + + return 0; +} + +/* + * Allocate and initialize parser state. + */ + +void * +pmclog_open(int fd) +{ + struct pmclog_parse_state *ps; + + if ((ps = (struct pmclog_parse_state *) malloc(sizeof(*ps))) == NULL) + return NULL; + + ps->ps_state = PL_STATE_NEW_RECORD; + ps->ps_arch = -1; + ps->ps_initialized = 0; + ps->ps_count = 0; + ps->ps_offset = (off_t) 0; + bzero(&ps->ps_saved, sizeof(ps->ps_saved)); + ps->ps_svcount = 0; + ps->ps_fd = fd; + ps->ps_data = NULL; + ps->ps_buffer = NULL; + ps->ps_len = 0; + + /* allocate space for a work area */ + if (ps->ps_fd != PMCLOG_FD_NONE) { + if ((ps->ps_buffer = malloc(PMCLOG_BUFFER_SIZE)) == NULL) { + free(ps); + return NULL; + } + } + + return ps; +} + + +/* + * Free up parser state. 
+ */ + +void +pmclog_close(void *cookie) +{ + struct pmclog_parse_state *ps; + + ps = (struct pmclog_parse_state *) cookie; + + if (ps->ps_buffer) + free(ps->ps_buffer); + + free(ps); +} diff --git a/lib/libpmc/pmclog.h b/lib/libpmc/pmclog.h new file mode 100644 index 0000000..b7c9c84 --- /dev/null +++ b/lib/libpmc/pmclog.h @@ -0,0 +1,170 @@ +/*- + * Copyright (c) 2005-2007 Joseph Koshy + * Copyright (c) 2007 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by A. Joseph Koshy under + * sponsorship from the FreeBSD Foundation and Google, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _PMCLOG_H_ +#define _PMCLOG_H_ + +#include <sys/cdefs.h> +#include <sys/pmclog.h> + +enum pmclog_state { + PMCLOG_OK, + PMCLOG_EOF, + PMCLOG_REQUIRE_DATA, + PMCLOG_ERROR +}; + +struct pmclog_ev_callchain { + uint32_t pl_pid; + uint32_t pl_pmcid; + uint32_t pl_cpuflags; + uint32_t pl_npc; + uintfptr_t pl_pc[PMC_CALLCHAIN_DEPTH_MAX]; +}; + +struct pmclog_ev_dropnotify { +}; + +struct pmclog_ev_closelog { +}; + +struct pmclog_ev_initialize { + uint32_t pl_version; + uint32_t pl_arch; +}; + +struct pmclog_ev_map_in { + pid_t pl_pid; + uintfptr_t pl_start; + char pl_pathname[PATH_MAX]; +}; + +struct pmclog_ev_map_out { + pid_t pl_pid; + uintfptr_t pl_start; + uintfptr_t pl_end; +}; + +struct pmclog_ev_pcsample { + uintfptr_t pl_pc; + pid_t pl_pid; + pmc_id_t pl_pmcid; + uint32_t pl_usermode; +}; + +struct pmclog_ev_pmcallocate { + uint32_t pl_event; + const char * pl_evname; + uint32_t pl_flags; + pmc_id_t pl_pmcid; +}; + +struct pmclog_ev_pmcattach { + pmc_id_t pl_pmcid; + pid_t pl_pid; + char pl_pathname[PATH_MAX]; +}; + +struct pmclog_ev_pmcdetach { + pmc_id_t pl_pmcid; + pid_t pl_pid; +}; + +struct pmclog_ev_proccsw { + pid_t pl_pid; + pmc_id_t pl_pmcid; + pmc_value_t pl_value; +}; + +struct pmclog_ev_procexec { + pid_t pl_pid; + pmc_id_t pl_pmcid; + uintfptr_t pl_entryaddr; + char pl_pathname[PATH_MAX]; +}; + +struct pmclog_ev_procexit { + uint32_t pl_pid; + pmc_id_t pl_pmcid; + pmc_value_t pl_value; +}; + +struct pmclog_ev_procfork { + pid_t pl_oldpid; + pid_t pl_newpid; +}; + +struct pmclog_ev_sysexit { + pid_t pl_pid; +}; + +struct pmclog_ev_userdata { + uint32_t pl_userdata; +}; + +struct pmclog_ev { + enum pmclog_state pl_state; /* state after 'get_event()' */ + off_t pl_offset; /* byte offset in stream */ + size_t pl_count; /* count of records so far */ + struct timespec pl_ts; /* log entry timestamp */ + enum pmclog_type pl_type; /* type of log entry */ + union { /* log entry data */ + struct pmclog_ev_callchain pl_cc; + 
struct pmclog_ev_closelog pl_cl; + struct pmclog_ev_dropnotify pl_dn; + struct pmclog_ev_initialize pl_i; + struct pmclog_ev_map_in pl_mi; + struct pmclog_ev_map_out pl_mo; + struct pmclog_ev_pcsample pl_s; + struct pmclog_ev_pmcallocate pl_a; + struct pmclog_ev_pmcattach pl_t; + struct pmclog_ev_pmcdetach pl_d; + struct pmclog_ev_proccsw pl_c; + struct pmclog_ev_procexec pl_x; + struct pmclog_ev_procexit pl_e; + struct pmclog_ev_procfork pl_f; + struct pmclog_ev_sysexit pl_se; + struct pmclog_ev_userdata pl_u; + } pl_u; +}; + +#define PMCLOG_FD_NONE (-1) + +__BEGIN_DECLS +void *pmclog_open(int _fd); +int pmclog_feed(void *_cookie, char *_data, int _len); +int pmclog_read(void *_cookie, struct pmclog_ev *_ev); +void pmclog_close(void *_cookie); +__END_DECLS + +#endif + |