diff options
-rw-r--r-- | sys/amd64/amd64/apic_vector.S | 12 | ||||
-rw-r--r-- | sys/amd64/amd64/mca.c | 251 | ||||
-rw-r--r-- | sys/amd64/include/apicreg.h | 4 | ||||
-rw-r--r-- | sys/amd64/include/apicvar.h | 12 | ||||
-rw-r--r-- | sys/amd64/include/mca.h | 1 | ||||
-rw-r--r-- | sys/amd64/include/pcpu.h | 3 | ||||
-rw-r--r-- | sys/amd64/include/specialreg.h | 2 | ||||
-rw-r--r-- | sys/i386/i386/apic_vector.s | 13 | ||||
-rw-r--r-- | sys/i386/i386/mca.c | 251 | ||||
-rw-r--r-- | sys/i386/include/apicreg.h | 4 | ||||
-rw-r--r-- | sys/i386/include/apicvar.h | 15 | ||||
-rw-r--r-- | sys/i386/include/mca.h | 1 | ||||
-rw-r--r-- | sys/i386/include/pcpu.h | 4 | ||||
-rw-r--r-- | sys/i386/include/specialreg.h | 2 | ||||
-rw-r--r-- | sys/x86/x86/local_apic.c | 40 |
15 files changed, 557 insertions, 58 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S index 4cfc18b..1c044b8 100644 --- a/sys/amd64/amd64/apic_vector.S +++ b/sys/amd64/amd64/apic_vector.S @@ -105,6 +105,18 @@ IDTVEC(timerint) jmp doreti /* + * Local APIC CMCI handler. + */ + .text + SUPERALIGN_TEXT +IDTVEC(cmcint) + PUSH_FRAME + FAKE_MCOUNT(TF_RIP(%rsp)) + call lapic_handle_cmc + MEXITCOUNT + jmp doreti + +/* * Local APIC error interrupt handler. */ .text diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c index ccbab17..f3b7e9e 100644 --- a/sys/amd64/amd64/mca.c +++ b/sys/amd64/amd64/mca.c @@ -33,6 +33,8 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/taskqueue.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> #include <machine/cputypes.h> #include <machine/mca.h> #include <machine/md_var.h> #include <machine/specialreg.h> +/* Modes for mca_scan() */ +enum scan_mode { + POLLED, + MCE, + CMCI, +}; + +/* + * State maintained for each monitored MCx bank to control the + * corrected machine check interrupt threshold. + */ +struct cmc_state { + int max_threshold; + int last_intr; +}; + struct mca_internal { struct mca_record rec; int logged; @@ -79,19 +99,22 @@ static struct callout mca_timer; static int mca_ticks = 3600; /* Check hourly by default. */ static struct task mca_task; static struct mtx mca_lock; +static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */ +static int cmc_banks; +static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */ static int -sysctl_mca_ticks(SYSCTL_HANDLER_ARGS) +sysctl_positive_int(SYSCTL_HANDLER_ARGS) { int error, value; - value = mca_ticks; + value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if (value <= 0) return (EINVAL); - mca_ticks = value; + *(int *)arg1 = value; return (0); } @@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record *record) } /* + * Update the interrupt threshold for a CMCI. The strategy is to use + * a low trigger that interrupts as soon as the first event occurs. + * However, if a steady stream of events arrive, the threshold is + * increased until the interrupts are throttled to once every + * cmc_throttle seconds or the periodic scan. If a periodic scan + * finds that the threshold is too high, it is lowered. + */ +static void +cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) +{ + struct cmc_state *cc; + uint64_t ctl; + u_int delta; + int count, limit; + + /* Fetch the current limit for this bank. */ + cc = &cmc_state[PCPU_GET(cpuid)][bank]; + ctl = rdmsr(MSR_MC_CTL2(bank)); + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + delta = (u_int)(ticks - cc->last_intr); + + /* + * If an interrupt was received less than cmc_throttle seconds + * since the previous interrupt and the count from the current + * event is greater than or equal to the current threshold, + * double the threshold up to the max. + */ + if (mode == CMCI && valid) { + limit = ctl & MC_CTL2_THRESHOLD; + if (delta < cmc_throttle && count >= limit && + limit < cc->max_threshold) { + limit = min(limit << 1, cc->max_threshold); + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= limit; + wrmsr(MSR_MC_CTL2(bank), limit); + } + cc->last_intr = ticks; + return; + } + + /* + * When the banks are polled, check to see if the threshold + * should be lowered. + */ + if (mode != POLLED) + return; + + /* If a CMCI occured recently, do nothing for now. */ + if (delta < cmc_throttle) + return; + + /* + * Compute a new limit based on the average rate of events per + * cmc_throttle seconds since the last interrupt. + */ + if (valid) { + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + limit = count * cmc_throttle / delta; + if (limit <= 0) + limit = 1; + else if (limit > cc->max_threshold) + limit = cc->max_threshold; + } else + limit = 1; + if ((ctl & MC_CTL2_THRESHOLD) != limit) { + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= limit; + wrmsr(MSR_MC_CTL2(bank), limit); + } +} + +/* * This scans all the machine check banks of the current CPU to see if * there are any machine checks. Any non-recoverable errors are * reported immediately via mca_log(). The current thread must be - * pinned when this is called. The 'mcip' parameter indicates if we - * are being called from the MC exception handler. In that case this - * function returns true if the system is restartable. Otherwise, it - * returns a count of the number of valid MC records found. + * pinned when this is called. The 'mode' parameter indicates if we + * are being called from the MC exception handler, the CMCI handler, + * or the periodic poller. In the MC exception case this function + * returns true if the system is restartable. Otherwise, it returns a + * count of the number of valid MC records found. */ static int -mca_scan(int mcip) +mca_scan(enum scan_mode mode) { struct mca_record rec; uint64_t mcg_cap, ucmask; - int count, i, recoverable; + int count, i, recoverable, valid; count = 0; recoverable = 1; ucmask = MC_STATUS_UC | MC_STATUS_PCC; /* When handling a MCE#, treat the OVER flag as non-restartable. */ - if (mcip) + if (mode == MCE) ucmask |= MC_STATUS_OVER; mcg_cap = rdmsr(MSR_MCG_CAP); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { - if (mca_check_status(i, &rec)) { + /* + * For a CMCI, only check banks this CPU is + * responsible for. + */ + if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i)) + continue; + + valid = mca_check_status(i, &rec); + if (valid) { count++; if (rec.mr_status & ucmask) { recoverable = 0; @@ -433,8 +537,15 @@ mca_scan(int mcip) } mca_record_entry(&rec); } + + /* + * If this is a bank this CPU monitors via CMCI, + * update the threshold. + */ + if (PCPU_GET(cmci_mask) & (1 << i)) + cmci_update(mode, i, valid, &rec); } - return (mcip ? recoverable : count); + return (mode == MCE ? recoverable : count); } /* @@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending) continue; sched_bind(td, cpu); thread_unlock(td); - count += mca_scan(0); + count += mca_scan(POLLED); thread_lock(td); sched_unbind(td); } @@ -511,7 +622,24 @@ mca_startup(void *dummy) SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL); static void -mca_setup(void) +cmci_setup(uint64_t mcg_cap) +{ + int i; + + cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **), + M_MCA, M_WAITOK); + cmc_banks = mcg_cap & MCG_CAP_COUNT; + for (i = 0; i <= mp_maxid; i++) + cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks, + M_MCA, M_WAITOK | M_ZERO); + SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, + "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &cmc_throttle, 0, sysctl_positive_int, "I", + "Interval in seconds to throttle corrected MC interrupts"); +} + +static void +mca_setup(uint64_t mcg_cap) { mtx_init(&mca_lock, "mca", NULL, MTX_SPIN); @@ -522,13 +650,62 @@ mca_setup(void) "count", CTLFLAG_RD, &mca_count, 0, "Record count"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks, - 0, sysctl_mca_ticks, "I", + 0, sysctl_positive_int, "I", "Periodic interval in seconds to scan for machine checks"); SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_mca_scan, "I", "Force an immediate scan for machine checks"); + if (mcg_cap & MCG_CAP_CMCI_P) + cmci_setup(mcg_cap); +} + +/* + * See if we should monitor CMCI for this bank. If CMCI_EN is already + * set in MC_CTL2, then another CPU is responsible for this bank, so + * ignore it. If CMCI_EN returns zero after being set, then this bank + * does not support CMCI_EN. If this CPU sets CMCI_EN, then it should + * now monitor this bank. + */ +static void +cmci_monitor(int i) +{ + struct cmc_state *cc; + uint64_t ctl; + + KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); + + ctl = rdmsr(MSR_MC_CTL2(i)); + if (ctl & MC_CTL2_CMCI_EN) + /* Already monitored by another CPU. */ + return; + + /* Set the threshold to one event for now. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= MC_CTL2_CMCI_EN | 1; + wrmsr(MSR_MC_CTL2(i), ctl); + ctl = rdmsr(MSR_MC_CTL2(i)); + if (!(ctl & MC_CTL2_CMCI_EN)) + /* This bank does not support CMCI. */ + return; + + cc = &cmc_state[PCPU_GET(cpuid)][i]; + + /* Determine maximum threshold. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= 0x7fff; + wrmsr(MSR_MC_CTL2(i), ctl); + ctl = rdmsr(MSR_MC_CTL2(i)); + cc->max_threshold = ctl & MC_CTL2_THRESHOLD; + + /* Start off with a threshold of 1. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= 1; + wrmsr(MSR_MC_CTL2(i), ctl); + + /* Mark this bank as monitored. */ + PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i); } /* Must be executed on each CPU. */ @@ -554,14 +731,14 @@ mca_init(void) workaround_erratum383 = 1; if (cpu_feature & CPUID_MCA) { - if (PCPU_GET(cpuid) == 0) - mca_setup(); + PCPU_SET(cmci_mask, 0); - sched_pin(); mcg_cap = rdmsr(MSR_MCG_CAP); if (mcg_cap & MCG_CAP_CTL_P) /* Enable MCA features. */ wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); + if (PCPU_GET(cpuid) == 0) + mca_setup(mcg_cap); /* * Disable logging of level one TLB parity (L1TP) errors by @@ -597,10 +774,16 @@ mca_init(void) if (!skip) wrmsr(MSR_MC_CTL(i), ctl); + + if (mcg_cap & MCG_CAP_CMCI_P) + cmci_monitor(i); + /* Clear all errors. */ wrmsr(MSR_MC_STATUS(i), 0); } - sched_unpin(); + + if (PCPU_GET(cmci_mask) != 0) + lapic_enable_cmc(); } load_cr4(rcr4() | CR4_MCE); @@ -624,7 +807,7 @@ mca_intr(void) } /* Scan the banks and check for any non-recoverable errors. */ - recoverable = mca_scan(1); + recoverable = mca_scan(MCE); mcg_status = rdmsr(MSR_MCG_STATUS); if (!(mcg_status & MCG_STATUS_RIPV)) recoverable = 0; @@ -633,3 +816,31 @@ mca_intr(void) wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP); return (recoverable); } + +/* Called for a CMCI (correctable machine check interrupt). */ +void +cmc_intr(void) +{ + struct mca_internal *mca; + int count; + + /* + * Serialize MCA bank scanning to prevent collisions from + * sibling threads. + */ + count = mca_scan(CMCI); + + /* If we found anything, log them to the console. */ + if (count != 0) { + mtx_lock_spin(&mca_lock); + STAILQ_FOREACH(mca, &mca_records, link) { + if (!mca->logged) { + mca->logged = 1; + mtx_unlock_spin(&mca_lock); + mca_log(&mca->rec); + mtx_lock_spin(&mca_lock); + } + } + mtx_unlock_spin(&mca_lock); + } +} diff --git a/sys/amd64/include/apicreg.h b/sys/amd64/include/apicreg.h index 33f293b..fee629b 100644 --- a/sys/amd64/include/apicreg.h +++ b/sys/amd64/include/apicreg.h @@ -89,7 +89,7 @@ * 2C0 Reserved * 2D0 Reserved * 2E0 Reserved - * 2F0 Reserved + * 2F0 Local Vector Table (CMCI) R/W * 300 ICR_LOW Interrupt Command Reg. (0-31) R/W * 310 ICR_HI Interrupt Command Reg. (32-63) R/W * 320 Local Vector Table (Timer) R/W @@ -172,7 +172,7 @@ struct LAPIC { /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; - /* reserved */ PAD4; + u_int32_t lvt_cmci; PAD3; u_int32_t icr_lo; PAD3; u_int32_t icr_hi; PAD3; u_int32_t lvt_timer; PAD3; diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h index 8dc04a9..4968842 100644 --- a/sys/amd64/include/apicvar.h +++ b/sys/amd64/include/apicvar.h @@ -108,8 +108,9 @@ #define APIC_LOCAL_INTS 240 #define APIC_ERROR_INT APIC_LOCAL_INTS #define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) +#define APIC_CMC_INT (APIC_LOCAL_INTS + 2) -#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2) +#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) #define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ #define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ #define IPI_INVLPG (APIC_IPI_INTS + 2) @@ -142,7 +143,8 @@ #define LVT_ERROR 3 #define LVT_PMC 4 #define LVT_THERMAL 5 -#define LVT_MAX LVT_THERMAL +#define LVT_CMCI 6 +#define LVT_MAX LVT_CMCI #ifndef LOCORE @@ -178,8 +180,8 @@ struct apic_enumerator { inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), - IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint), - IDTVEC(timerint); + IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), + IDTVEC(spuriousint), IDTVEC(timerint); extern vm_paddr_t lapic_paddr; extern int apic_cpuids[]; @@ -209,6 +211,7 @@ void lapic_create(u_int apic_id, int boot_cpu); void lapic_disable(void); void lapic_disable_pmc(void); void lapic_dump(const char *str); +void lapic_enable_cmc(void); int lapic_enable_pmc(void); void lapic_eoi(void); int lapic_id(void); @@ -217,6 +220,7 @@ int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); int lapic_ipi_wait(int delay); +void lapic_handle_cmc(void); void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); diff --git a/sys/amd64/include/mca.h b/sys/amd64/include/mca.h index bc09480..951750f 100644 --- a/sys/amd64/include/mca.h +++ b/sys/amd64/include/mca.h @@ -46,6 +46,7 @@ struct mca_record { #ifdef _KERNEL +void cmc_intr(void); void mca_init(void); int mca_intr(void); diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h index 30f7a7b..a55627f 100644 --- a/sys/amd64/include/pcpu.h +++ b/sys/amd64/include/pcpu.h @@ -75,7 +75,8 @@ /* Pointer to the CPU LDT descriptor */ \ struct system_segment_descriptor *pc_ldt; \ /* Pointer to the CPU TSS descriptor */ \ - struct system_segment_descriptor *pc_tss \ + struct system_segment_descriptor *pc_tss; \ + u_int pc_cmci_mask /* MCx banks for CMCI */ \ PCPU_XEN_FIELDS #ifdef _KERNEL diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h index 895619c..4e19d8e 100644 --- a/sys/amd64/include/specialreg.h +++ b/sys/amd64/include/specialreg.h @@ -385,7 +385,7 @@ #define MC_STATUS_VAL 0x8000000000000000 #define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ -#define MC_CTL2_THRESHOLD 0x0000000000003fff +#define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 /* diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s index c4c5b01..e3000e1 100644 --- a/sys/i386/i386/apic_vector.s +++ b/sys/i386/i386/apic_vector.s @@ -111,6 +111,19 @@ IDTVEC(timerint) jmp doreti /* + * Local APIC CMCI handler. + */ + .text + SUPERALIGN_TEXT +IDTVEC(cmcint) + PUSH_FRAME + SET_KERNEL_SREGS + FAKE_MCOUNT(TF_EIP(%esp)) + call lapic_handle_cmc + MEXITCOUNT + jmp doreti + +/* * Local APIC error interrupt handler. */ .text diff --git a/sys/i386/i386/mca.c b/sys/i386/i386/mca.c index 8d33b51..6ede87b 100644 --- a/sys/i386/i386/mca.c +++ b/sys/i386/i386/mca.c @@ -33,6 +33,8 @@ __FBSDID("$FreeBSD$"); #include <sys/param.h> +#include <sys/bus.h> +#include <sys/interrupt.h> #include <sys/kernel.h> #include <sys/lock.h> #include <sys/malloc.h> @@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$"); #include <sys/sysctl.h> #include <sys/systm.h> #include <sys/taskqueue.h> +#include <machine/intr_machdep.h> +#include <machine/apicvar.h> #include <machine/cputypes.h> #include <machine/mca.h> #include <machine/md_var.h> #include <machine/specialreg.h> +/* Modes for mca_scan() */ +enum scan_mode { + POLLED, + MCE, + CMCI, +}; + +/* + * State maintained for each monitored MCx bank to control the + * corrected machine check interrupt threshold. + */ +struct cmc_state { + int max_threshold; + int last_intr; +}; + struct mca_internal { struct mca_record rec; int logged; @@ -79,19 +99,22 @@ static struct callout mca_timer; static int mca_ticks = 3600; /* Check hourly by default. */ static struct task mca_task; static struct mtx mca_lock; +static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */ +static int cmc_banks; +static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */ static int -sysctl_mca_ticks(SYSCTL_HANDLER_ARGS) +sysctl_positive_int(SYSCTL_HANDLER_ARGS) { int error, value; - value = mca_ticks; + value = *(int *)arg1; error = sysctl_handle_int(oidp, &value, 0, req); if (error || req->newptr == NULL) return (error); if (value <= 0) return (EINVAL); - mca_ticks = value; + *(int *)arg1 = value; return (0); } @@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record *record) } /* + * Update the interrupt threshold for a CMCI. The strategy is to use + * a low trigger that interrupts as soon as the first event occurs. + * However, if a steady stream of events arrive, the threshold is + * increased until the interrupts are throttled to once every + * cmc_throttle seconds or the periodic scan. If a periodic scan + * finds that the threshold is too high, it is lowered. + */ +static void +cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec) +{ + struct cmc_state *cc; + uint64_t ctl; + u_int delta; + int count, limit; + + /* Fetch the current limit for this bank. */ + cc = &cmc_state[PCPU_GET(cpuid)][bank]; + ctl = rdmsr(MSR_MC_CTL2(bank)); + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + delta = (u_int)(ticks - cc->last_intr); + + /* + * If an interrupt was received less than cmc_throttle seconds + * since the previous interrupt and the count from the current + * event is greater than or equal to the current threshold, + * double the threshold up to the max. + */ + if (mode == CMCI && valid) { + limit = ctl & MC_CTL2_THRESHOLD; + if (delta < cmc_throttle && count >= limit && + limit < cc->max_threshold) { + limit = min(limit << 1, cc->max_threshold); + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= limit; + wrmsr(MSR_MC_CTL2(bank), limit); + } + cc->last_intr = ticks; + return; + } + + /* + * When the banks are polled, check to see if the threshold + * should be lowered. + */ + if (mode != POLLED) + return; + + /* If a CMCI occured recently, do nothing for now. */ + if (delta < cmc_throttle) + return; + + /* + * Compute a new limit based on the average rate of events per + * cmc_throttle seconds since the last interrupt. + */ + if (valid) { + count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38; + limit = count * cmc_throttle / delta; + if (limit <= 0) + limit = 1; + else if (limit > cc->max_threshold) + limit = cc->max_threshold; + } else + limit = 1; + if ((ctl & MC_CTL2_THRESHOLD) != limit) { + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= limit; + wrmsr(MSR_MC_CTL2(bank), limit); + } +} + +/* * This scans all the machine check banks of the current CPU to see if * there are any machine checks. Any non-recoverable errors are * reported immediately via mca_log(). The current thread must be - * pinned when this is called. The 'mcip' parameter indicates if we - * are being called from the MC exception handler. In that case this - * function returns true if the system is restartable. Otherwise, it - * returns a count of the number of valid MC records found. + * pinned when this is called. The 'mode' parameter indicates if we + * are being called from the MC exception handler, the CMCI handler, + * or the periodic poller. In the MC exception case this function + * returns true if the system is restartable. Otherwise, it returns a + * count of the number of valid MC records found. */ static int -mca_scan(int mcip) +mca_scan(enum scan_mode mode) { struct mca_record rec; uint64_t mcg_cap, ucmask; - int count, i, recoverable; + int count, i, recoverable, valid; count = 0; recoverable = 1; ucmask = MC_STATUS_UC | MC_STATUS_PCC; /* When handling a MCE#, treat the OVER flag as non-restartable. */ - if (mcip) + if (mode == MCE) ucmask |= MC_STATUS_OVER; mcg_cap = rdmsr(MSR_MCG_CAP); for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) { - if (mca_check_status(i, &rec)) { + /* + * For a CMCI, only check banks this CPU is + * responsible for. + */ + if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i)) + continue; + + valid = mca_check_status(i, &rec); + if (valid) { count++; if (rec.mr_status & ucmask) { recoverable = 0; @@ -433,8 +537,15 @@ mca_scan(int mcip) } mca_record_entry(&rec); } + + /* + * If this is a bank this CPU monitors via CMCI, + * update the threshold. + */ + if (PCPU_GET(cmci_mask) & (1 << i)) + cmci_update(mode, i, valid, &rec); } - return (mcip ? recoverable : count); + return (mode == MCE ? recoverable : count); } /* @@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending) continue; sched_bind(td, cpu); thread_unlock(td); - count += mca_scan(0); + count += mca_scan(POLLED); thread_lock(td); sched_unbind(td); } @@ -511,7 +622,24 @@ mca_startup(void *dummy) SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL); static void -mca_setup(void) +cmci_setup(uint64_t mcg_cap) +{ + int i; + + cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state **), + M_MCA, M_WAITOK); + cmc_banks = mcg_cap & MCG_CAP_COUNT; + for (i = 0; i <= mp_maxid; i++) + cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks, + M_MCA, M_WAITOK | M_ZERO); + SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, + "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, + &cmc_throttle, 0, sysctl_positive_int, "I", + "Interval in seconds to throttle corrected MC interrupts"); +} + +static void +mca_setup(uint64_t mcg_cap) { mtx_init(&mca_lock, "mca", NULL, MTX_SPIN); @@ -522,13 +650,62 @@ mca_setup(void) "count", CTLFLAG_RD, &mca_count, 0, "Record count"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks, - 0, sysctl_mca_ticks, "I", + 0, sysctl_positive_int, "I", "Periodic interval in seconds to scan for machine checks"); SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "records", CTLFLAG_RD, sysctl_mca_records, "Machine check records"); SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO, "force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0, sysctl_mca_scan, "I", "Force an immediate scan for machine checks"); + if (mcg_cap & MCG_CAP_CMCI_P) + cmci_setup(mcg_cap); +} + +/* + * See if we should monitor CMCI for this bank. If CMCI_EN is already + * set in MC_CTL2, then another CPU is responsible for this bank, so + * ignore it. If CMCI_EN returns zero after being set, then this bank + * does not support CMCI_EN. If this CPU sets CMCI_EN, then it should + * now monitor this bank. + */ +static void +cmci_monitor(int i) +{ + struct cmc_state *cc; + uint64_t ctl; + + KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid))); + + ctl = rdmsr(MSR_MC_CTL2(i)); + if (ctl & MC_CTL2_CMCI_EN) + /* Already monitored by another CPU. */ + return; + + /* Set the threshold to one event for now. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= MC_CTL2_CMCI_EN | 1; + wrmsr(MSR_MC_CTL2(i), ctl); + ctl = rdmsr(MSR_MC_CTL2(i)); + if (!(ctl & MC_CTL2_CMCI_EN)) + /* This bank does not support CMCI. */ + return; + + cc = &cmc_state[PCPU_GET(cpuid)][i]; + + /* Determine maximum threshold. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= 0x7fff; + wrmsr(MSR_MC_CTL2(i), ctl); + ctl = rdmsr(MSR_MC_CTL2(i)); + cc->max_threshold = ctl & MC_CTL2_THRESHOLD; + + /* Start off with a threshold of 1. */ + ctl &= ~MC_CTL2_THRESHOLD; + ctl |= 1; + wrmsr(MSR_MC_CTL2(i), ctl); + + /* Mark this bank as monitored. */ + PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i); } /* Must be executed on each CPU. */ @@ -554,14 +731,14 @@ mca_init(void) workaround_erratum383 = 1; if (cpu_feature & CPUID_MCA) { - if (PCPU_GET(cpuid) == 0) - mca_setup(); + PCPU_SET(cmci_mask, 0); - sched_pin(); mcg_cap = rdmsr(MSR_MCG_CAP); if (mcg_cap & MCG_CAP_CTL_P) /* Enable MCA features. */ wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE); + if (PCPU_GET(cpuid) == 0) + mca_setup(mcg_cap); /* * Disable logging of level one TLB parity (L1TP) errors by @@ -597,10 +774,16 @@ mca_init(void) if (!skip) wrmsr(MSR_MC_CTL(i), ctl); + + if (mcg_cap & MCG_CAP_CMCI_P) + cmci_monitor(i); + /* Clear all errors. */ wrmsr(MSR_MC_STATUS(i), 0); } - sched_unpin(); + + if (PCPU_GET(cmci_mask) != 0) + lapic_enable_cmc(); } load_cr4(rcr4() | CR4_MCE); @@ -624,7 +807,7 @@ mca_intr(void) } /* Scan the banks and check for any non-recoverable errors. */ - recoverable = mca_scan(1); + recoverable = mca_scan(MCE); mcg_status = rdmsr(MSR_MCG_STATUS); if (!(mcg_status & MCG_STATUS_RIPV)) recoverable = 0; @@ -633,3 +816,31 @@ mca_intr(void) wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP); return (recoverable); } + +/* Called for a CMCI (correctable machine check interrupt). */ +void +cmc_intr(void) +{ + struct mca_internal *mca; + int count; + + /* + * Serialize MCA bank scanning to prevent collisions from + * sibling threads. + */ + count = mca_scan(CMCI); + + /* If we found anything, log them to the console. */ + if (count != 0) { + mtx_lock_spin(&mca_lock); + STAILQ_FOREACH(mca, &mca_records, link) { + if (!mca->logged) { + mca->logged = 1; + mtx_unlock_spin(&mca_lock); + mca_log(&mca->rec); + mtx_lock_spin(&mca_lock); + } + } + mtx_unlock_spin(&mca_lock); + } +} diff --git a/sys/i386/include/apicreg.h b/sys/i386/include/apicreg.h index 33f293b..fee629b 100644 --- a/sys/i386/include/apicreg.h +++ b/sys/i386/include/apicreg.h @@ -89,7 +89,7 @@ * 2C0 Reserved * 2D0 Reserved * 2E0 Reserved - * 2F0 Reserved + * 2F0 Local Vector Table (CMCI) R/W * 300 ICR_LOW Interrupt Command Reg. (0-31) R/W * 310 ICR_HI Interrupt Command Reg. (32-63) R/W * 320 Local Vector Table (Timer) R/W @@ -172,7 +172,7 @@ struct LAPIC { /* reserved */ PAD4; /* reserved */ PAD4; /* reserved */ PAD4; - /* reserved */ PAD4; + u_int32_t lvt_cmci; PAD3; u_int32_t icr_lo; PAD3; u_int32_t icr_hi; PAD3; u_int32_t lvt_timer; PAD3; diff --git a/sys/i386/include/apicvar.h b/sys/i386/include/apicvar.h index adfc1e8..f917357 100644 --- a/sys/i386/include/apicvar.h +++ b/sys/i386/include/apicvar.h @@ -108,7 +108,8 @@ #define APIC_LOCAL_INTS 240 #define APIC_ERROR_INT APIC_LOCAL_INTS #define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) -#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2) +#define APIC_CMC_INT (APIC_LOCAL_INTS + 2) +#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) #define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ #define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ @@ -135,7 +136,8 @@ #define APIC_LOCAL_INTS 240 #define APIC_ERROR_INT APIC_LOCAL_INTS #define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1) -#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2) +#define APIC_CMC_INT (APIC_LOCAL_INTS + 2) +#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3) #define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */ #define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */ @@ -170,7 +172,8 @@ #define LVT_ERROR 3 #define LVT_PMC 4 #define LVT_THERMAL 5 -#define LVT_MAX LVT_THERMAL +#define LVT_CMCI 6 +#define LVT_MAX LVT_CMCI #ifndef LOCORE @@ -206,8 +209,8 @@ struct apic_enumerator { inthand_t IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3), IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6), - IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint), - IDTVEC(timerint); + IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint), + IDTVEC(spuriousint), IDTVEC(timerint); extern vm_paddr_t lapic_paddr; extern int apic_cpuids[]; @@ -237,6 +240,7 @@ void lapic_create(u_int apic_id, int boot_cpu); void lapic_disable(void); void lapic_disable_pmc(void); void lapic_dump(const char *str); +void lapic_enable_cmc(void); int lapic_enable_pmc(void); void lapic_eoi(void); int lapic_id(void); @@ -245,6 +249,7 @@ int lapic_intr_pending(u_int vector); void lapic_ipi_raw(register_t icrlo, u_int dest); void lapic_ipi_vectored(u_int vector, int dest); int lapic_ipi_wait(int delay); +void lapic_handle_cmc(void); void lapic_handle_error(void); void lapic_handle_intr(int vector, struct trapframe *frame); void lapic_handle_timer(struct trapframe *frame); diff --git a/sys/i386/include/mca.h b/sys/i386/include/mca.h index bc09480..951750f 100644 --- a/sys/i386/include/mca.h +++ b/sys/i386/include/mca.h @@ -46,6 +46,7 @@ struct mca_record { #ifdef _KERNEL +void cmc_intr(void); void mca_init(void); int mca_intr(void); diff --git a/sys/i386/include/pcpu.h b/sys/i386/include/pcpu.h index 6cc03d0..5345eb6 100644 --- a/sys/i386/include/pcpu.h +++ b/sys/i386/include/pcpu.h @@ -76,6 +76,7 @@ struct shadow_time_info { u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_apic_id; \ int pc_private_tss; /* Flag indicating private tss*/\ + u_int pc_cmci_mask; /* MCx banks for CMCI */ \ u_int pc_cr3; /* track cr3 for R1/R3*/ \ u_int pc_pdir; \ u_int pc_lazypmap; \ @@ -102,7 +103,8 @@ struct shadow_time_info { int pc_currentldt; \ u_int pc_acpi_id; /* ACPI CPU id */ \ u_int pc_apic_id; \ - int pc_private_tss /* Flag indicating private tss */ + int pc_private_tss; /* Flag indicating private tss*/\ + u_int pc_cmci_mask /* MCx banks for CMCI */ \ #endif diff --git a/sys/i386/include/specialreg.h b/sys/i386/include/specialreg.h index efcf924..b550574 100644 --- a/sys/i386/include/specialreg.h +++ b/sys/i386/include/specialreg.h @@ -454,7 +454,7 @@ #define MC_STATUS_VAL 0x8000000000000000 #define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */ #define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */ -#define MC_CTL2_THRESHOLD 0x0000000000003fff +#define MC_CTL2_THRESHOLD 0x0000000000007fff #define MC_CTL2_CMCI_EN 0x0000000040000000 /* diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c index f40026b..7fec4f6 100644 --- a/sys/x86/x86/local_apic.c +++ b/sys/x86/x86/local_apic.c @@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$"); #include <machine/frame.h> #include <machine/intr_machdep.h> #include <machine/apicvar.h> +#include <machine/mca.h> #include <machine/md_var.h> #include <machine/smp.h> #include <machine/specialreg.h> @@ -130,6 +131,7 @@ static struct lvt lvts[LVT_MAX + 1] = { { 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */ { 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */ + { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */ }; static inthand_t *ioint_handlers[] = { @@ -235,6 +237,9 @@ lapic_init(vm_paddr_t addr) setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC); /* XXX: Thermal interrupt */ + + /* Local APIC CMCI. */ + setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC); } /* @@ -260,7 +265,7 @@ lapic_create(u_int apic_id, int boot_cpu) */ lapics[apic_id].la_present = 1; lapics[apic_id].la_id = apic_id; - for (i = 0; i < LVT_MAX; i++) { + for (i = 0; i <= LVT_MAX; i++) { lapics[apic_id].la_lvts[i] = lvts[i]; lapics[apic_id].la_lvts[i].lvt_active = 0; } @@ -290,6 +295,7 @@ lapic_dump(const char* str) printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n", lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error, lapic->lvt_pcint); + printf(" cmci: 0x%08x\n", lapic->lvt_cmci); } void @@ -341,6 +347,10 @@ lapic_setup(int boot) /* XXX: Thermal LVT */ + /* Program the CMCI LVT entry if present. */ + if (maxlvt >= LVT_CMCI) + lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci); + intr_restore(eflags); } @@ -838,6 +848,34 @@ lapic_timer_enable_intr(void) } void +lapic_handle_cmc(void) +{ + + lapic_eoi(); + cmc_intr(); +} + +/* + * Called from the mca_init() to activate the CMC interrupt if this CPU is + * responsible for monitoring any MC banks for CMC events. Since mca_init() + * is called prior to lapic_setup() during boot, this just needs to unmask + * this CPU's LVT_CMCI entry. + */ +void +lapic_enable_cmc(void) +{ + u_int apic_id; + + apic_id = PCPU_GET(apic_id); + KASSERT(lapics[apic_id].la_present, + ("%s: missing APIC %u", __func__, apic_id)); + lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0; + lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1; + if (bootverbose) + printf("lapic%u: CMCI unmasked\n", apic_id); +} + +void lapic_handle_error(void) { u_int32_t esr; |