Diffstat:
-rw-r--r--  sys/amd64/amd64/apic_vector.S  |  12
-rw-r--r--  sys/amd64/amd64/mca.c          | 251
-rw-r--r--  sys/amd64/include/apicreg.h    |   4
-rw-r--r--  sys/amd64/include/apicvar.h    |  12
-rw-r--r--  sys/amd64/include/mca.h        |   1
-rw-r--r--  sys/amd64/include/pcpu.h       |   3
-rw-r--r--  sys/amd64/include/specialreg.h |   2
-rw-r--r--  sys/i386/i386/apic_vector.s    |  13
-rw-r--r--  sys/i386/i386/mca.c            | 251
-rw-r--r--  sys/i386/include/apicreg.h     |   4
-rw-r--r--  sys/i386/include/apicvar.h     |  15
-rw-r--r--  sys/i386/include/mca.h         |   1
-rw-r--r--  sys/i386/include/pcpu.h        |   4
-rw-r--r--  sys/i386/include/specialreg.h  |   2
-rw-r--r--  sys/x86/x86/local_apic.c       |  40
15 files changed, 557 insertions(+), 58 deletions(-)
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 4cfc18b..1c044b8 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -105,6 +105,18 @@ IDTVEC(timerint)
jmp doreti
/*
+ * Local APIC CMCI handler.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(cmcint)
+ PUSH_FRAME
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ call lapic_handle_cmc
+ MEXITCOUNT
+ jmp doreti
+
+/*
* Local APIC error interrupt handler.
*/
.text
diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c
index ccbab17..f3b7e9e 100644
--- a/sys/amd64/amd64/mca.c
+++ b/sys/amd64/amd64/mca.c
@@ -33,6 +33,8 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
#include <machine/cputypes.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
+/* Modes for mca_scan() */
+enum scan_mode {
+ POLLED,
+ MCE,
+ CMCI,
+};
+
+/*
+ * State maintained for each monitored MCx bank to control the
+ * corrected machine check interrupt threshold.
+ */
+struct cmc_state {
+ int max_threshold;
+ int last_intr;
+};
+
struct mca_internal {
struct mca_record rec;
int logged;
@@ -79,19 +99,22 @@ static struct callout mca_timer;
static int mca_ticks = 3600; /* Check hourly by default. */
static struct task mca_task;
static struct mtx mca_lock;
+static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */
+static int cmc_banks;
+static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */
static int
-sysctl_mca_ticks(SYSCTL_HANDLER_ARGS)
+sysctl_positive_int(SYSCTL_HANDLER_ARGS)
{
int error, value;
- value = mca_ticks;
+ value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error || req->newptr == NULL)
return (error);
if (value <= 0)
return (EINVAL);
- mca_ticks = value;
+ *(int *)arg1 = value;
return (0);
}
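Generalizing the handler lets a single validator back both hw.mca.interval and the new hw.mca.cmc_throttle through the arg1 pointer. The validate-then-commit shape in miniature (editor's sketch with hypothetical names, not kernel API):

#include <errno.h>
#include <stdio.h>

/*
 * Hypothetical stand-in for the sysctl plumbing: operate on a copy,
 * reject bad input, and only then store through the supplied pointer.
 */
static int
set_positive_int(int *knob, int new_value)
{
	if (new_value <= 0)
		return (EINVAL);	/* *knob is left untouched */
	*knob = new_value;
	return (0);
}

int
main(void)
{
	int mca_ticks = 3600, cmc_throttle = 60;

	/* One validator serves both knobs, as arg1 does in the handler. */
	printf("%d %d\n", set_positive_int(&mca_ticks, 1800),
	    set_positive_int(&cmc_throttle, -5));	/* 0, then EINVAL */
	return (0);
}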
@@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record *record)
}
/*
+ * Update the interrupt threshold for a CMCI. The strategy is to use
+ * a low trigger that interrupts as soon as the first event occurs.
+ * However, if a steady stream of events arrives, the threshold is
+ * increased until the interrupts are throttled to at most once every
+ * cmc_throttle seconds, deferring further events to the periodic
+ * scan. If a periodic scan finds that the threshold is too high, it
+ * is lowered.
+ */
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+ u_int delta;
+ int count, limit;
+
+ /* Fetch the current limit for this bank. */
+ cc = &cmc_state[PCPU_GET(cpuid)][bank];
+ ctl = rdmsr(MSR_MC_CTL2(bank));
+ count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+ delta = (u_int)(ticks - cc->last_intr);
+
+ /*
+ * If an interrupt was received less than cmc_throttle seconds
+ * since the previous interrupt and the count from the current
+ * event is greater than or equal to the current threshold,
+ * double the threshold up to the max.
+ */
+ if (mode == CMCI && valid) {
+ limit = ctl & MC_CTL2_THRESHOLD;
+ if (delta < cmc_throttle * hz && count >= limit &&
+ limit < cc->max_threshold) {
+ limit = min(limit << 1, cc->max_threshold);
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= limit;
+ wrmsr(MSR_MC_CTL2(bank), ctl);
+ }
+ cc->last_intr = ticks;
+ return;
+ }
+
+ /*
+ * When the banks are polled, check to see if the threshold
+ * should be lowered.
+ */
+ if (mode != POLLED)
+ return;
+
+ /* If a CMCI occurred recently, do nothing for now. */
+ if (delta < cmc_throttle * hz)
+ return;
+
+ /*
+ * Compute a new limit based on the average rate of events per
+ * cmc_throttle seconds since the last interrupt.
+ */
+ if (valid) {
+ limit = count * cmc_throttle * hz / delta;
+ if (limit <= 0)
+ limit = 1;
+ else if (limit > cc->max_threshold)
+ limit = cc->max_threshold;
+ } else
+ limit = 1;
+ if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= limit;
+ wrmsr(MSR_MC_CTL2(bank), ctl);
+ }
+}
+
+/*
* This scans all the machine check banks of the current CPU to see if
* there are any machine checks. Any non-recoverable errors are
* reported immediately via mca_log(). The current thread must be
- * pinned when this is called. The 'mcip' parameter indicates if we
- * are being called from the MC exception handler. In that case this
- * function returns true if the system is restartable. Otherwise, it
- * returns a count of the number of valid MC records found.
+ * pinned when this is called. The 'mode' parameter indicates if we
+ * are being called from the MC exception handler, the CMCI handler,
+ * or the periodic poller. In the MC exception case this function
+ * returns true if the system is restartable. Otherwise, it returns a
+ * count of the number of valid MC records found.
*/
static int
-mca_scan(int mcip)
+mca_scan(enum scan_mode mode)
{
struct mca_record rec;
uint64_t mcg_cap, ucmask;
- int count, i, recoverable;
+ int count, i, recoverable, valid;
count = 0;
recoverable = 1;
ucmask = MC_STATUS_UC | MC_STATUS_PCC;
/* When handling a MCE#, treat the OVER flag as non-restartable. */
- if (mcip)
+ if (mode == MCE)
ucmask |= MC_STATUS_OVER;
mcg_cap = rdmsr(MSR_MCG_CAP);
for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
- if (mca_check_status(i, &rec)) {
+ /*
+ * For a CMCI, only check banks this CPU is
+ * responsible for.
+ */
+ if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
+ continue;
+
+ valid = mca_check_status(i, &rec);
+ if (valid) {
count++;
if (rec.mr_status & ucmask) {
recoverable = 0;
@@ -433,8 +537,15 @@ mca_scan(int mcip)
}
mca_record_entry(&rec);
}
+
+ /*
+ * If this is a bank this CPU monitors via CMCI,
+ * update the threshold.
+ */
+ if (PCPU_GET(cmci_mask) & (1 << i))
+ cmci_update(mode, i, valid, &rec);
}
- return (mcip ? recoverable : count);
+ return (mode == MCE ? recoverable : count);
}
/*
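The threshold policy is easier to see outside the kernel. Below is a standalone sketch of the arithmetic only: the limit doubles (capped at the probed maximum) while interrupts arrive faster than the throttle window, and the polled path decays it toward the observed event rate. Time is kept in seconds here for simplicity (the kernel works in ticks, hence the hz factor above); names and values are illustrative:

#include <stdio.h>

#define THRESHOLD_MAX	0x7fff	/* matches the widened MC_CTL2_THRESHOLD */

/* CMCI path: double the limit on a burst, capped at the bank maximum. */
static int
burst_limit(int limit, int max_threshold)
{
	limit <<= 1;
	return (limit < max_threshold ? limit : max_threshold);
}

/* Polled path: average events per throttle window since the last CMCI. */
static int
polled_limit(int count, int throttle, int delta, int max_threshold)
{
	int limit = count * throttle / delta;

	if (limit <= 0)
		return (1);
	return (limit < max_threshold ? limit : max_threshold);
}

int
main(void)
{
	int limit = 1;

	/* Two bursts inside the throttle window: 1 -> 2 -> 4. */
	limit = burst_limit(limit, THRESHOLD_MAX);
	limit = burst_limit(limit, THRESHOLD_MAX);
	printf("after bursts: %d\n", limit);

	/* Quiet period: 3 events in 300s with a 60s window decays to 1. */
	printf("after decay: %d\n", polled_limit(3, 60, 300, THRESHOLD_MAX));
	return (0);
}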
@@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending)
continue;
sched_bind(td, cpu);
thread_unlock(td);
- count += mca_scan(0);
+ count += mca_scan(POLLED);
thread_lock(td);
sched_unbind(td);
}
@@ -511,7 +622,24 @@ mca_startup(void *dummy)
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
static void
-mca_setup(void)
+cmci_setup(uint64_t mcg_cap)
+{
+ int i;
+
+ cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *),
+ M_MCA, M_WAITOK);
+ cmc_banks = mcg_cap & MCG_CAP_COUNT;
+ for (i = 0; i <= mp_maxid; i++)
+ cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+ M_MCA, M_WAITOK | M_ZERO);
+ SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+ "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &cmc_throttle, 0, sysctl_positive_int, "I",
+ "Interval in seconds to throttle corrected MC interrupts");
+}
+
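cmci_setup() sizes the outer array by mp_maxid + 1 and hangs one zeroed row of cmc_state off each CPU slot, giving the [cpuid][bank] indexing used by cmci_update(). The same jagged layout in portable C (a sketch; allocation-failure handling elided, names illustrative):

#include <stdlib.h>

struct cmc_state {
	int	max_threshold;
	int	last_intr;
};

/* Two-level [cpu][bank] table, as cmci_setup() builds with malloc(9). */
static struct cmc_state **
alloc_cmc_state(unsigned int ncpus, unsigned int nbanks)
{
	struct cmc_state **tab;
	unsigned int i;

	tab = malloc(ncpus * sizeof(*tab));		/* row pointers */
	for (i = 0; i < ncpus; i++)
		tab[i] = calloc(nbanks, sizeof(**tab));	/* zeroed, like M_ZERO */
	return (tab);
}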
+static void
+mca_setup(uint64_t mcg_cap)
{
mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
@@ -522,13 +650,62 @@ mca_setup(void)
"count", CTLFLAG_RD, &mca_count, 0, "Record count");
SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
- 0, sysctl_mca_ticks, "I",
+ 0, sysctl_positive_int, "I",
"Periodic interval in seconds to scan for machine checks");
SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
+ if (mcg_cap & MCG_CAP_CMCI_P)
+ cmci_setup(mcg_cap);
+}
+
+/*
+ * See if we should monitor CMCI for this bank. If CMCI_EN is already
+ * set in MC_CTL2, then another CPU is responsible for this bank, so
+ * ignore it. If CMCI_EN returns zero after being set, then this bank
+ * does not support CMCI_EN. If this CPU sets CMCI_EN, then it should
+ * now monitor this bank.
+ */
+static void
+cmci_monitor(int i)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+
+ KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
+
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ if (ctl & MC_CTL2_CMCI_EN)
+ /* Already monitored by another CPU. */
+ return;
+
+ /* Set the threshold to one event for now. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= MC_CTL2_CMCI_EN | 1;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ if (!(ctl & MC_CTL2_CMCI_EN))
+ /* This bank does not support CMCI. */
+ return;
+
+ cc = &cmc_state[PCPU_GET(cpuid)][i];
+
+ /* Determine maximum threshold. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= 0x7fff;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
+
+ /* Start off with a threshold of 1. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= 1;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+
+ /* Mark this bank as monitored. */
+ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}
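The claim/probe handshake in cmci_monitor() leans on unimplemented MSR bits reading back as zero. A user-space sketch against a mock register; here the fake bank implements CMCI but only 14 threshold bits, so probing the maximum reads back 0x3fff (all names below are illustrative):

#include <stdint.h>
#include <stdio.h>

#define CTL2_CMCI_EN	0x0000000040000000ULL
#define CTL2_THRESHOLD	0x0000000000007fffULL

/* Mock MC_CTL2: hardware keeps only the bits it implements. */
static uint64_t mock_ctl2;
static uint64_t rd(void) { return (mock_ctl2); }
static void wr(uint64_t v) { mock_ctl2 = v & (CTL2_CMCI_EN | 0x3fffULL); }

int
main(void)
{

	if (rd() & CTL2_CMCI_EN) {
		printf("bank already owned by another CPU\n");
		return (0);
	}
	wr(rd() | CTL2_CMCI_EN | 1);		/* claim with threshold 1 */
	if (!(rd() & CTL2_CMCI_EN)) {
		printf("bank does not support CMCI\n");
		return (0);
	}
	/* Probe the max threshold: write all ones, keep what sticks. */
	wr((rd() & ~CTL2_THRESHOLD) | CTL2_THRESHOLD);
	printf("max threshold: 0x%llx\n",
	    (unsigned long long)(rd() & CTL2_THRESHOLD));
	return (0);
}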
/* Must be executed on each CPU. */
@@ -554,14 +731,14 @@ mca_init(void)
workaround_erratum383 = 1;
if (cpu_feature & CPUID_MCA) {
- if (PCPU_GET(cpuid) == 0)
- mca_setup();
+ PCPU_SET(cmci_mask, 0);
- sched_pin();
mcg_cap = rdmsr(MSR_MCG_CAP);
if (mcg_cap & MCG_CAP_CTL_P)
/* Enable MCA features. */
wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
+ if (PCPU_GET(cpuid) == 0)
+ mca_setup(mcg_cap);
/*
* Disable logging of level one TLB parity (L1TP) errors by
@@ -597,10 +774,16 @@ mca_init(void)
if (!skip)
wrmsr(MSR_MC_CTL(i), ctl);
+
+ if (mcg_cap & MCG_CAP_CMCI_P)
+ cmci_monitor(i);
+
/* Clear all errors. */
wrmsr(MSR_MC_STATUS(i), 0);
}
- sched_unpin();
+
+ if (PCPU_GET(cmci_mask) != 0)
+ lapic_enable_cmc();
}
load_cr4(rcr4() | CR4_MCE);
@@ -624,7 +807,7 @@ mca_intr(void)
}
/* Scan the banks and check for any non-recoverable errors. */
- recoverable = mca_scan(1);
+ recoverable = mca_scan(MCE);
mcg_status = rdmsr(MSR_MCG_STATUS);
if (!(mcg_status & MCG_STATUS_RIPV))
recoverable = 0;
@@ -633,3 +816,31 @@ mca_intr(void)
wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
return (recoverable);
}
+
+/* Called for a CMCI (correctable machine check interrupt). */
+void
+cmc_intr(void)
+{
+ struct mca_internal *mca;
+ int count;
+
+ /*
+ * Serialize MCA bank scanning to prevent collisions from
+ * sibling threads.
+ */
+ count = mca_scan(CMCI);
+
+ /* If we found any new records, log them to the console. */
+ if (count != 0) {
+ mtx_lock_spin(&mca_lock);
+ STAILQ_FOREACH(mca, &mca_records, link) {
+ if (!mca->logged) {
+ mca->logged = 1;
+ mtx_unlock_spin(&mca_lock);
+ mca_log(&mca->rec);
+ mtx_lock_spin(&mca_lock);
+ }
+ }
+ mtx_unlock_spin(&mca_lock);
+ }
+}
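mca_log() bottoms out in printf(9), which must not run while holding a spin mutex, so cmc_intr() drops mca_lock around each unlogged record and relies on the logged flag to keep the walk idempotent across the lock gap. The pattern in miniature (stub lock and log routines, purely illustrative):

#include <stdio.h>

struct rec {
	int		logged;
	int		id;
	struct rec	*next;
};

/* Hypothetical stand-ins for mtx_(un)lock_spin() and mca_log(). */
static void lock(void) { }
static void unlock(void) { }
static void slow_log(const struct rec *r) { printf("rec %d\n", r->id); }

/*
 * Mark the entry before dropping the lock so a concurrent walker skips
 * it, do the slow console work unlocked, then retake the lock and go on.
 */
static void
log_new_records(struct rec *head)
{
	struct rec *r;

	lock();
	for (r = head; r != NULL; r = r->next) {
		if (!r->logged) {
			r->logged = 1;
			unlock();
			slow_log(r);
			lock();
		}
	}
	unlock();
}

int
main(void)
{
	struct rec b = { 0, 2, NULL }, a = { 1, 1, &b };

	log_new_records(&a);	/* only "rec 2" is printed */
	return (0);
}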
diff --git a/sys/amd64/include/apicreg.h b/sys/amd64/include/apicreg.h
index 33f293b..fee629b 100644
--- a/sys/amd64/include/apicreg.h
+++ b/sys/amd64/include/apicreg.h
@@ -89,7 +89,7 @@
* 2C0 Reserved
* 2D0 Reserved
* 2E0 Reserved
- * 2F0 Reserved
+ * 2F0 Local Vector Table (CMCI) R/W
* 300 ICR_LOW Interrupt Command Reg. (0-31) R/W
* 310 ICR_HI Interrupt Command Reg. (32-63) R/W
* 320 Local Vector Table (Timer) R/W
@@ -172,7 +172,7 @@ struct LAPIC {
/* reserved */ PAD4;
/* reserved */ PAD4;
/* reserved */ PAD4;
- /* reserved */ PAD4;
+ u_int32_t lvt_cmci; PAD3;
u_int32_t icr_lo; PAD3;
u_int32_t icr_hi; PAD3;
u_int32_t lvt_timer; PAD3;
diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h
index 8dc04a9..4968842 100644
--- a/sys/amd64/include/apicvar.h
+++ b/sys/amd64/include/apicvar.h
@@ -108,8 +108,9 @@
#define APIC_LOCAL_INTS 240
#define APIC_ERROR_INT APIC_LOCAL_INTS
#define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
+#define APIC_CMC_INT (APIC_LOCAL_INTS + 2)
-#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2)
+#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3)
#define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */
#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */
#define IPI_INVLPG (APIC_IPI_INTS + 2)
@@ -142,7 +143,8 @@
#define LVT_ERROR 3
#define LVT_PMC 4
#define LVT_THERMAL 5
-#define LVT_MAX LVT_THERMAL
+#define LVT_CMCI 6
+#define LVT_MAX LVT_CMCI
#ifndef LOCORE
@@ -178,8 +180,8 @@ struct apic_enumerator {
inthand_t
IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
- IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint),
- IDTVEC(timerint);
+ IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
+ IDTVEC(spuriousint), IDTVEC(timerint);
extern vm_paddr_t lapic_paddr;
extern int apic_cpuids[];
@@ -209,6 +211,7 @@ void lapic_create(u_int apic_id, int boot_cpu);
void lapic_disable(void);
void lapic_disable_pmc(void);
void lapic_dump(const char *str);
+void lapic_enable_cmc(void);
int lapic_enable_pmc(void);
void lapic_eoi(void);
int lapic_id(void);
@@ -217,6 +220,7 @@ int lapic_intr_pending(u_int vector);
void lapic_ipi_raw(register_t icrlo, u_int dest);
void lapic_ipi_vectored(u_int vector, int dest);
int lapic_ipi_wait(int delay);
+void lapic_handle_cmc(void);
void lapic_handle_error(void);
void lapic_handle_intr(int vector, struct trapframe *frame);
void lapic_handle_timer(struct trapframe *frame);
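The CMC vector slots in after the thermal vector, pushing the IPI block up by one; every IPI_* value inherits the shift automatically because all of them are defined relative to APIC_IPI_INTS. The resulting numbering, restated as a compilable check:

#include <assert.h>

#define APIC_LOCAL_INTS		240
#define APIC_ERROR_INT		APIC_LOCAL_INTS		/* 240 */
#define APIC_THERMAL_INT	(APIC_LOCAL_INTS + 1)	/* 241 */
#define APIC_CMC_INT		(APIC_LOCAL_INTS + 2)	/* 242, new */
#define APIC_IPI_INTS		(APIC_LOCAL_INTS + 3)	/* 243, was 242 */

int
main(void)
{

	assert(APIC_CMC_INT == 242);
	assert(APIC_IPI_INTS == 243);
	return (0);
}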
diff --git a/sys/amd64/include/mca.h b/sys/amd64/include/mca.h
index bc09480..951750f 100644
--- a/sys/amd64/include/mca.h
+++ b/sys/amd64/include/mca.h
@@ -46,6 +46,7 @@ struct mca_record {
#ifdef _KERNEL
+void cmc_intr(void);
void mca_init(void);
int mca_intr(void);
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index 30f7a7b..a55627f 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -75,7 +75,8 @@
/* Pointer to the CPU LDT descriptor */ \
struct system_segment_descriptor *pc_ldt; \
/* Pointer to the CPU TSS descriptor */ \
- struct system_segment_descriptor *pc_tss \
+ struct system_segment_descriptor *pc_tss; \
+ u_int pc_cmci_mask /* MCx banks for CMCI */ \
PCPU_XEN_FIELDS
#ifdef _KERNEL
diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h
index 895619c..4e19d8e 100644
--- a/sys/amd64/include/specialreg.h
+++ b/sys/amd64/include/specialreg.h
@@ -385,7 +385,7 @@
#define MC_STATUS_VAL 0x8000000000000000
#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */
#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */
-#define MC_CTL2_THRESHOLD 0x0000000000003fff
+#define MC_CTL2_THRESHOLD 0x0000000000007fff
#define MC_CTL2_CMCI_EN 0x0000000040000000
/*
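The threshold mask grows from 14 to 15 bits, matching the architectural field (bits 14:0 of the MC_CTL2 MSR) and the width of the corrected-error count in MC_STATUS (bits 52:38, the existing MC_STATUS_COR_COUNT mask) that the hardware compares against it. A sketch of both field extractions with fabricated register values:

#include <stdint.h>
#include <stdio.h>

#define MC_STATUS_COR_COUNT	0x001fffc000000000ULL	/* bits 52:38 */
#define MC_CTL2_THRESHOLD	0x0000000000007fffULL	/* now 15 bits */

int
main(void)
{
	uint64_t status = (uint64_t)5 << 38;	/* fabricated: 5 events */
	uint64_t ctl2 = 0x40000003ULL;		/* CMCI_EN | threshold 3 */

	printf("count = %llu\n",
	    (unsigned long long)((status & MC_STATUS_COR_COUNT) >> 38));
	printf("threshold = %llu\n",
	    (unsigned long long)(ctl2 & MC_CTL2_THRESHOLD));
	return (0);
}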
diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s
index c4c5b01..e3000e1 100644
--- a/sys/i386/i386/apic_vector.s
+++ b/sys/i386/i386/apic_vector.s
@@ -111,6 +111,19 @@ IDTVEC(timerint)
jmp doreti
/*
+ * Local APIC CMCI handler.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(cmcint)
+ PUSH_FRAME
+ SET_KERNEL_SREGS
+ FAKE_MCOUNT(TF_EIP(%esp))
+ call lapic_handle_cmc
+ MEXITCOUNT
+ jmp doreti
+
+/*
* Local APIC error interrupt handler.
*/
.text
diff --git a/sys/i386/i386/mca.c b/sys/i386/i386/mca.c
index 8d33b51..6ede87b 100644
--- a/sys/i386/i386/mca.c
+++ b/sys/i386/i386/mca.c
@@ -33,6 +33,8 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
#include <machine/cputypes.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
+/* Modes for mca_scan() */
+enum scan_mode {
+ POLLED,
+ MCE,
+ CMCI,
+};
+
+/*
+ * State maintained for each monitored MCx bank to control the
+ * corrected machine check interrupt threshold.
+ */
+struct cmc_state {
+ int max_threshold;
+ int last_intr;
+};
+
struct mca_internal {
struct mca_record rec;
int logged;
@@ -79,19 +99,22 @@ static struct callout mca_timer;
static int mca_ticks = 3600; /* Check hourly by default. */
static struct task mca_task;
static struct mtx mca_lock;
+static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */
+static int cmc_banks;
+static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */
static int
-sysctl_mca_ticks(SYSCTL_HANDLER_ARGS)
+sysctl_positive_int(SYSCTL_HANDLER_ARGS)
{
int error, value;
- value = mca_ticks;
+ value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error || req->newptr == NULL)
return (error);
if (value <= 0)
return (EINVAL);
- mca_ticks = value;
+ *(int *)arg1 = value;
return (0);
}
@@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record *record)
}
/*
+ * Update the interrupt threshold for a CMCI. The strategy is to use
+ * a low trigger that interrupts as soon as the first event occurs.
+ * However, if a steady stream of events arrives, the threshold is
+ * increased until the interrupts are throttled to at most once every
+ * cmc_throttle seconds, deferring further events to the periodic
+ * scan. If a periodic scan finds that the threshold is too high, it
+ * is lowered.
+ */
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+ u_int delta;
+ int count, limit;
+
+ /* Fetch the current limit for this bank. */
+ cc = &cmc_state[PCPU_GET(cpuid)][bank];
+ ctl = rdmsr(MSR_MC_CTL2(bank));
+ count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+ delta = (u_int)(ticks - cc->last_intr);
+
+ /*
+ * If an interrupt was received less than cmc_throttle seconds
+ * since the previous interrupt and the count from the current
+ * event is greater than or equal to the current threshold,
+ * double the threshold up to the max.
+ */
+ if (mode == CMCI && valid) {
+ limit = ctl & MC_CTL2_THRESHOLD;
+ if (delta < cmc_throttle * hz && count >= limit &&
+ limit < cc->max_threshold) {
+ limit = min(limit << 1, cc->max_threshold);
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= limit;
+ wrmsr(MSR_MC_CTL2(bank), ctl);
+ }
+ cc->last_intr = ticks;
+ return;
+ }
+
+ /*
+ * When the banks are polled, check to see if the threshold
+ * should be lowered.
+ */
+ if (mode != POLLED)
+ return;
+
+ /* If a CMCI occurred recently, do nothing for now. */
+ if (delta < cmc_throttle * hz)
+ return;
+
+ /*
+ * Compute a new limit based on the average rate of events per
+ * cmc_throttle seconds since the last interrupt.
+ */
+ if (valid) {
+ limit = count * cmc_throttle * hz / delta;
+ if (limit <= 0)
+ limit = 1;
+ else if (limit > cc->max_threshold)
+ limit = cc->max_threshold;
+ } else
+ limit = 1;
+ if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= limit;
+ wrmsr(MSR_MC_CTL2(bank), ctl);
+ }
+}
+
+/*
* This scans all the machine check banks of the current CPU to see if
* there are any machine checks. Any non-recoverable errors are
* reported immediately via mca_log(). The current thread must be
- * pinned when this is called. The 'mcip' parameter indicates if we
- * are being called from the MC exception handler. In that case this
- * function returns true if the system is restartable. Otherwise, it
- * returns a count of the number of valid MC records found.
+ * pinned when this is called. The 'mode' parameter indicates if we
+ * are being called from the MC exception handler, the CMCI handler,
+ * or the periodic poller. In the MC exception case this function
+ * returns true if the system is restartable. Otherwise, it returns a
+ * count of the number of valid MC records found.
*/
static int
-mca_scan(int mcip)
+mca_scan(enum scan_mode mode)
{
struct mca_record rec;
uint64_t mcg_cap, ucmask;
- int count, i, recoverable;
+ int count, i, recoverable, valid;
count = 0;
recoverable = 1;
ucmask = MC_STATUS_UC | MC_STATUS_PCC;
/* When handling a MCE#, treat the OVER flag as non-restartable. */
- if (mcip)
+ if (mode == MCE)
ucmask |= MC_STATUS_OVER;
mcg_cap = rdmsr(MSR_MCG_CAP);
for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
- if (mca_check_status(i, &rec)) {
+ /*
+ * For a CMCI, only check banks this CPU is
+ * responsible for.
+ */
+ if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
+ continue;
+
+ valid = mca_check_status(i, &rec);
+ if (valid) {
count++;
if (rec.mr_status & ucmask) {
recoverable = 0;
@@ -433,8 +537,15 @@ mca_scan(int mcip)
}
mca_record_entry(&rec);
}
+
+ /*
+ * If this is a bank this CPU monitors via CMCI,
+ * update the threshold.
+ */
+ if (PCPU_GET(cmci_mask) & (1 << i))
+ cmci_update(mode, i, valid, &rec);
}
- return (mcip ? recoverable : count);
+ return (mode == MCE ? recoverable : count);
}
/*
@@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending)
continue;
sched_bind(td, cpu);
thread_unlock(td);
- count += mca_scan(0);
+ count += mca_scan(POLLED);
thread_lock(td);
sched_unbind(td);
}
@@ -511,7 +622,24 @@ mca_startup(void *dummy)
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
static void
-mca_setup(void)
+cmci_setup(uint64_t mcg_cap)
+{
+ int i;
+
+ cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *),
+ M_MCA, M_WAITOK);
+ cmc_banks = mcg_cap & MCG_CAP_COUNT;
+ for (i = 0; i <= mp_maxid; i++)
+ cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+ M_MCA, M_WAITOK | M_ZERO);
+ SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+ "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &cmc_throttle, 0, sysctl_positive_int, "I",
+ "Interval in seconds to throttle corrected MC interrupts");
+}
+
+static void
+mca_setup(uint64_t mcg_cap)
{
mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
@@ -522,13 +650,62 @@ mca_setup(void)
"count", CTLFLAG_RD, &mca_count, 0, "Record count");
SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
- 0, sysctl_mca_ticks, "I",
+ 0, sysctl_positive_int, "I",
"Periodic interval in seconds to scan for machine checks");
SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
+ if (mcg_cap & MCG_CAP_CMCI_P)
+ cmci_setup(mcg_cap);
+}
+
+/*
+ * See if we should monitor CMCI for this bank. If CMCI_EN is already
+ * set in MC_CTL2, then another CPU is responsible for this bank, so
+ * ignore it. If CMCI_EN returns zero after being set, then this bank
+ * does not support CMCI_EN. If this CPU sets CMCI_EN, then it should
+ * now monitor this bank.
+ */
+static void
+cmci_monitor(int i)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+
+ KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
+
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ if (ctl & MC_CTL2_CMCI_EN)
+ /* Already monitored by another CPU. */
+ return;
+
+ /* Set the threshold to one event for now. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= MC_CTL2_CMCI_EN | 1;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ if (!(ctl & MC_CTL2_CMCI_EN))
+ /* This bank does not support CMCI. */
+ return;
+
+ cc = &cmc_state[PCPU_GET(cpuid)][i];
+
+ /* Determine maximum threshold. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= 0x7fff;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
+
+ /* Start off with a threshold of 1. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= 1;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+
+ /* Mark this bank as monitored. */
+ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}
/* Must be executed on each CPU. */
@@ -554,14 +731,14 @@ mca_init(void)
workaround_erratum383 = 1;
if (cpu_feature & CPUID_MCA) {
- if (PCPU_GET(cpuid) == 0)
- mca_setup();
+ PCPU_SET(cmci_mask, 0);
- sched_pin();
mcg_cap = rdmsr(MSR_MCG_CAP);
if (mcg_cap & MCG_CAP_CTL_P)
/* Enable MCA features. */
wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
+ if (PCPU_GET(cpuid) == 0)
+ mca_setup(mcg_cap);
/*
* Disable logging of level one TLB parity (L1TP) errors by
@@ -597,10 +774,16 @@ mca_init(void)
if (!skip)
wrmsr(MSR_MC_CTL(i), ctl);
+
+ if (mcg_cap & MCG_CAP_CMCI_P)
+ cmci_monitor(i);
+
/* Clear all errors. */
wrmsr(MSR_MC_STATUS(i), 0);
}
- sched_unpin();
+
+ if (PCPU_GET(cmci_mask) != 0)
+ lapic_enable_cmc();
}
load_cr4(rcr4() | CR4_MCE);
@@ -624,7 +807,7 @@ mca_intr(void)
}
/* Scan the banks and check for any non-recoverable errors. */
- recoverable = mca_scan(1);
+ recoverable = mca_scan(MCE);
mcg_status = rdmsr(MSR_MCG_STATUS);
if (!(mcg_status & MCG_STATUS_RIPV))
recoverable = 0;
@@ -633,3 +816,31 @@ mca_intr(void)
wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
return (recoverable);
}
+
+/* Called for a CMCI (correctable machine check interrupt). */
+void
+cmc_intr(void)
+{
+ struct mca_internal *mca;
+ int count;
+
+ /*
+ * Serialize MCA bank scanning to prevent collisions from
+ * sibling threads.
+ */
+ count = mca_scan(CMCI);
+
+ /* If we found any new records, log them to the console. */
+ if (count != 0) {
+ mtx_lock_spin(&mca_lock);
+ STAILQ_FOREACH(mca, &mca_records, link) {
+ if (!mca->logged) {
+ mca->logged = 1;
+ mtx_unlock_spin(&mca_lock);
+ mca_log(&mca->rec);
+ mtx_lock_spin(&mca_lock);
+ }
+ }
+ mtx_unlock_spin(&mca_lock);
+ }
+}
diff --git a/sys/i386/include/apicreg.h b/sys/i386/include/apicreg.h
index 33f293b..fee629b 100644
--- a/sys/i386/include/apicreg.h
+++ b/sys/i386/include/apicreg.h
@@ -89,7 +89,7 @@
* 2C0 Reserved
* 2D0 Reserved
* 2E0 Reserved
- * 2F0 Reserved
+ * 2F0 Local Vector Table (CMCI) R/W
* 300 ICR_LOW Interrupt Command Reg. (0-31) R/W
* 310 ICR_HI Interrupt Command Reg. (32-63) R/W
* 320 Local Vector Table (Timer) R/W
@@ -172,7 +172,7 @@ struct LAPIC {
/* reserved */ PAD4;
/* reserved */ PAD4;
/* reserved */ PAD4;
- /* reserved */ PAD4;
+ u_int32_t lvt_cmci; PAD3;
u_int32_t icr_lo; PAD3;
u_int32_t icr_hi; PAD3;
u_int32_t lvt_timer; PAD3;
diff --git a/sys/i386/include/apicvar.h b/sys/i386/include/apicvar.h
index adfc1e8..f917357 100644
--- a/sys/i386/include/apicvar.h
+++ b/sys/i386/include/apicvar.h
@@ -108,7 +108,8 @@
#define APIC_LOCAL_INTS 240
#define APIC_ERROR_INT APIC_LOCAL_INTS
#define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
-#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2)
+#define APIC_CMC_INT (APIC_LOCAL_INTS + 2)
+#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3)
#define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */
#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */
@@ -135,7 +136,8 @@
#define APIC_LOCAL_INTS 240
#define APIC_ERROR_INT APIC_LOCAL_INTS
#define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
-#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2)
+#define APIC_CMC_INT (APIC_LOCAL_INTS + 2)
+#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3)
#define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */
#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */
@@ -170,7 +172,8 @@
#define LVT_ERROR 3
#define LVT_PMC 4
#define LVT_THERMAL 5
-#define LVT_MAX LVT_THERMAL
+#define LVT_CMCI 6
+#define LVT_MAX LVT_CMCI
#ifndef LOCORE
@@ -206,8 +209,8 @@ struct apic_enumerator {
inthand_t
IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
- IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint),
- IDTVEC(timerint);
+ IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
+ IDTVEC(spuriousint), IDTVEC(timerint);
extern vm_paddr_t lapic_paddr;
extern int apic_cpuids[];
@@ -237,6 +240,7 @@ void lapic_create(u_int apic_id, int boot_cpu);
void lapic_disable(void);
void lapic_disable_pmc(void);
void lapic_dump(const char *str);
+void lapic_enable_cmc(void);
int lapic_enable_pmc(void);
void lapic_eoi(void);
int lapic_id(void);
@@ -245,6 +249,7 @@ int lapic_intr_pending(u_int vector);
void lapic_ipi_raw(register_t icrlo, u_int dest);
void lapic_ipi_vectored(u_int vector, int dest);
int lapic_ipi_wait(int delay);
+void lapic_handle_cmc(void);
void lapic_handle_error(void);
void lapic_handle_intr(int vector, struct trapframe *frame);
void lapic_handle_timer(struct trapframe *frame);
diff --git a/sys/i386/include/mca.h b/sys/i386/include/mca.h
index bc09480..951750f 100644
--- a/sys/i386/include/mca.h
+++ b/sys/i386/include/mca.h
@@ -46,6 +46,7 @@ struct mca_record {
#ifdef _KERNEL
+void cmc_intr(void);
void mca_init(void);
int mca_intr(void);
diff --git a/sys/i386/include/pcpu.h b/sys/i386/include/pcpu.h
index 6cc03d0..5345eb6 100644
--- a/sys/i386/include/pcpu.h
+++ b/sys/i386/include/pcpu.h
@@ -76,6 +76,7 @@ struct shadow_time_info {
u_int pc_acpi_id; /* ACPI CPU id */ \
u_int pc_apic_id; \
int pc_private_tss; /* Flag indicating private tss*/\
+ u_int pc_cmci_mask; /* MCx banks for CMCI */ \
u_int pc_cr3; /* track cr3 for R1/R3*/ \
u_int pc_pdir; \
u_int pc_lazypmap; \
@@ -102,7 +103,8 @@ struct shadow_time_info {
int pc_currentldt; \
u_int pc_acpi_id; /* ACPI CPU id */ \
u_int pc_apic_id; \
- int pc_private_tss /* Flag indicating private tss */
+ int pc_private_tss; /* Flag indicating private tss*/\
+ u_int pc_cmci_mask /* MCx banks for CMCI */ \
#endif
diff --git a/sys/i386/include/specialreg.h b/sys/i386/include/specialreg.h
index efcf924..b550574 100644
--- a/sys/i386/include/specialreg.h
+++ b/sys/i386/include/specialreg.h
@@ -454,7 +454,7 @@
#define MC_STATUS_VAL 0x8000000000000000
#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */
#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */
-#define MC_CTL2_THRESHOLD 0x0000000000003fff
+#define MC_CTL2_THRESHOLD 0x0000000000007fff
#define MC_CTL2_CMCI_EN 0x0000000040000000
/*
diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c
index f40026b..7fec4f6 100644
--- a/sys/x86/x86/local_apic.c
+++ b/sys/x86/x86/local_apic.c
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
+#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
@@ -130,6 +131,7 @@ static struct lvt lvts[LVT_MAX + 1] = {
{ 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */
{ 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */
{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */
+ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */
};
static inthand_t *ioint_handlers[] = {
@@ -235,6 +237,9 @@ lapic_init(vm_paddr_t addr)
setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
/* XXX: Thermal interrupt */
+
+ /* Local APIC CMCI. */
+ setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
}
/*
@@ -260,7 +265,7 @@ lapic_create(u_int apic_id, int boot_cpu)
*/
lapics[apic_id].la_present = 1;
lapics[apic_id].la_id = apic_id;
- for (i = 0; i < LVT_MAX; i++) {
+ for (i = 0; i <= LVT_MAX; i++) {
lapics[apic_id].la_lvts[i] = lvts[i];
lapics[apic_id].la_lvts[i].lvt_active = 0;
}
@@ -290,6 +295,7 @@ lapic_dump(const char* str)
printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n",
lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error,
lapic->lvt_pcint);
+ printf(" cmci: 0x%08x\n", lapic->lvt_cmci);
}
void
@@ -341,6 +347,10 @@ lapic_setup(int boot)
/* XXX: Thermal LVT */
+ /* Program the CMCI LVT entry if present. */
+ if (maxlvt >= LVT_CMCI)
+ lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci);
+
intr_restore(eflags);
}
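lapic_setup() now folds the stored per-CPU CMCI configuration into the hardware register through lvt_mode(), which composes mask, delivery mode, and vector. A simplified sketch of the architectural LVT encoding involved (vector in bits 7:0, delivery mode in bits 10:8, mask in bit 16); this illustrates the register layout only, not the kernel's lvt_mode() itself:

#include <stdint.h>
#include <stdio.h>

#define LVT_MASKED	(1u << 16)
#define LVT_DM_FIXED	(0u << 8)

/* Compose an LVT word for a fixed-mode interrupt on 'vector'. */
static uint32_t
lvt_value(int masked, uint32_t vector)
{
	uint32_t v = LVT_DM_FIXED | (vector & 0xff);

	if (masked)
		v |= LVT_MASKED;
	return (v);
}

int
main(void)
{

	/* Before lapic_enable_cmc(): masked; after: delivering vector 242. */
	printf("masked:   0x%05x\n", lvt_value(1, 242));
	printf("unmasked: 0x%05x\n", lvt_value(0, 242));
	return (0);
}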
@@ -838,6 +848,34 @@ lapic_timer_enable_intr(void)
}
void
+lapic_handle_cmc(void)
+{
+
+ lapic_eoi();
+ cmc_intr();
+}
+
+/*
+ * Called from mca_init() to activate the CMC interrupt if this CPU is
+ * responsible for monitoring any MC banks for CMC events. Since mca_init()
+ * is called prior to lapic_setup() during boot, this just needs to unmask
+ * this CPU's LVT_CMCI entry.
+ */
+void
+lapic_enable_cmc(void)
+{
+ u_int apic_id;
+
+ apic_id = PCPU_GET(apic_id);
+ KASSERT(lapics[apic_id].la_present,
+ ("%s: missing APIC %u", __func__, apic_id));
+ lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0;
+ lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1;
+ if (bootverbose)
+ printf("lapic%u: CMCI unmasked\n", apic_id);
+}
+
+void
lapic_handle_error(void)
{
u_int32_t esr;