author		jhb <jhb@FreeBSD.org>	2010-05-24 15:45:05 +0000
committer	jhb <jhb@FreeBSD.org>	2010-05-24 15:45:05 +0000
commit		9e6f9b1e86420cd7da13f1d568950ab52ff2428f (patch)
tree		a9d317cc94e24d55cd44eed2052b78547f4a46f9
parent		ce59c74efda35e47b39621f57c0fe63e02c7af5b (diff)
Add support for corrected machine check interrupts.  CMCI is a new local
APIC interrupt that fires when a threshold of corrected machine check
events is reached.  CMCI also includes a count of events when reporting
corrected errors in the bank's status register.  Note that individual
banks may or may not support CMCI.  If they do, each bank includes its
own threshold register that determines when the interrupt fires.
Currently the code uses a very simple strategy where it doubles the
threshold on each interrupt until it succeeds in throttling the
interrupt to occur only once a minute (this interval can be tuned via
sysctl).  The threshold is also adjusted on each hourly poll, which will
lower the threshold once events stop occurring.

Tested by:	Sailaja Bangaru  sbappana at yahoo com
MFC after:	1 month
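
In sketch form, the throttling strategy boils down to two adjustments: double the threshold when interrupts arrive too quickly, and recompute it from the observed event rate on each poll.  The standalone C sketch below is illustrative only; bank_state, threshold_on_interrupt, and threshold_on_poll are hypothetical names, while the actual patch operates on the MC_CTL2 MSRs and the per-CPU cmc_state array.

/* Hypothetical per-bank state, mirroring struct cmc_state in the patch. */
struct bank_state {
	int max_threshold;	/* hardware maximum probed at boot */
	int last_intr;		/* time (in seconds) of the last CMCI */
};

/*
 * On each CMCI: if interrupts arrive faster than once per
 * throttle_secs, double the trigger threshold, capped at the maximum.
 */
int
threshold_on_interrupt(struct bank_state *bs, int threshold, int now,
    int throttle_secs)
{

	if (now - bs->last_intr < throttle_secs &&
	    threshold < bs->max_threshold) {
		threshold <<= 1;
		if (threshold > bs->max_threshold)
			threshold = bs->max_threshold;
	}
	bs->last_intr = now;
	return (threshold);
}

/*
 * On each periodic poll: size the threshold to the observed event
 * rate so it falls back toward 1 once an error burst subsides.
 */
int
threshold_on_poll(int events, int elapsed_secs, int throttle_secs,
    int max_threshold)
{
	int limit;

	limit = events * throttle_secs / (elapsed_secs > 0 ? elapsed_secs : 1);
	if (limit <= 0)
		limit = 1;
	else if (limit > max_threshold)
		limit = max_threshold;
	return (limit);
}

One detail elided from the sketch: the kernel only doubles the threshold when the corrected-event count reported in the bank's status register has actually reached the current threshold.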
-rw-r--r--  sys/amd64/amd64/apic_vector.S   |  12
-rw-r--r--  sys/amd64/amd64/mca.c           | 251
-rw-r--r--  sys/amd64/include/apicreg.h     |   4
-rw-r--r--  sys/amd64/include/apicvar.h     |  12
-rw-r--r--  sys/amd64/include/mca.h         |   1
-rw-r--r--  sys/amd64/include/pcpu.h        |   3
-rw-r--r--  sys/amd64/include/specialreg.h  |   2
-rw-r--r--  sys/i386/i386/apic_vector.s     |  13
-rw-r--r--  sys/i386/i386/mca.c             | 251
-rw-r--r--  sys/i386/include/apicreg.h      |   4
-rw-r--r--  sys/i386/include/apicvar.h      |  15
-rw-r--r--  sys/i386/include/mca.h          |   1
-rw-r--r--  sys/i386/include/pcpu.h         |   4
-rw-r--r--  sys/i386/include/specialreg.h   |   2
-rw-r--r--  sys/x86/x86/local_apic.c        |  40
15 files changed, 557 insertions, 58 deletions
diff --git a/sys/amd64/amd64/apic_vector.S b/sys/amd64/amd64/apic_vector.S
index 4cfc18b..1c044b8 100644
--- a/sys/amd64/amd64/apic_vector.S
+++ b/sys/amd64/amd64/apic_vector.S
@@ -105,6 +105,18 @@ IDTVEC(timerint)
jmp doreti
/*
+ * Local APIC CMCI handler.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(cmcint)
+ PUSH_FRAME
+ FAKE_MCOUNT(TF_RIP(%rsp))
+ call lapic_handle_cmc
+ MEXITCOUNT
+ jmp doreti
+
+/*
* Local APIC error interrupt handler.
*/
.text
diff --git a/sys/amd64/amd64/mca.c b/sys/amd64/amd64/mca.c
index ccbab17..f3b7e9e 100644
--- a/sys/amd64/amd64/mca.c
+++ b/sys/amd64/amd64/mca.c
@@ -33,6 +33,8 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
#include <machine/cputypes.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
+/* Modes for mca_scan() */
+enum scan_mode {
+ POLLED,
+ MCE,
+ CMCI,
+};
+
+/*
+ * State maintained for each monitored MCx bank to control the
+ * corrected machine check interrupt threshold.
+ */
+struct cmc_state {
+ int max_threshold;
+ int last_intr;
+};
+
struct mca_internal {
struct mca_record rec;
int logged;
@@ -79,19 +99,22 @@ static struct callout mca_timer;
static int mca_ticks = 3600; /* Check hourly by default. */
static struct task mca_task;
static struct mtx mca_lock;
+static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */
+static int cmc_banks;
+static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */
static int
-sysctl_mca_ticks(SYSCTL_HANDLER_ARGS)
+sysctl_positive_int(SYSCTL_HANDLER_ARGS)
{
int error, value;
- value = mca_ticks;
+ value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error || req->newptr == NULL)
return (error);
if (value <= 0)
return (EINVAL);
- mca_ticks = value;
+ *(int *)arg1 = value;
return (0);
}
@@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record *record)
}
/*
+ * Update the interrupt threshold for a CMCI. The strategy is to use
+ * a low trigger that interrupts as soon as the first event occurs.
+ * However, if a steady stream of events arrives, the threshold is
+ * increased until the interrupts are throttled to once every
+ * cmc_throttle seconds or the periodic scan. If a periodic scan
+ * finds that the threshold is too high, it is lowered.
+ */
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+ u_int delta;
+ int count, limit;
+
+ /* Fetch the current limit for this bank. */
+ cc = &cmc_state[PCPU_GET(cpuid)][bank];
+ ctl = rdmsr(MSR_MC_CTL2(bank));
+ count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+ delta = (u_int)(ticks - cc->last_intr);
+
+ /*
+ * If an interrupt was received less than cmc_throttle seconds
+ * since the previous interrupt and the count from the current
+ * event is greater than or equal to the current threshold,
+ * double the threshold up to the max.
+ */
+ if (mode == CMCI && valid) {
+ limit = ctl & MC_CTL2_THRESHOLD;
+ if (delta < cmc_throttle && count >= limit &&
+ limit < cc->max_threshold) {
+ limit = min(limit << 1, cc->max_threshold);
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= limit;
+ wrmsr(MSR_MC_CTL2(bank), ctl);
+ }
+ cc->last_intr = ticks;
+ return;
+ }
+
+ /*
+ * When the banks are polled, check to see if the threshold
+ * should be lowered.
+ */
+ if (mode != POLLED)
+ return;
+
+ /* If a CMCI occurred recently, do nothing for now. */
+ if (delta < cmc_throttle)
+ return;
+
+ /*
+ * Compute a new limit based on the average rate of events per
+ * cmc_throttle seconds since the last interrupt.
+ */
+ if (valid) {
+ count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+ limit = count * cmc_throttle / delta;
+ if (limit <= 0)
+ limit = 1;
+ else if (limit > cc->max_threshold)
+ limit = cc->max_threshold;
+ } else
+ limit = 1;
+ if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= limit;
+ wrmsr(MSR_MC_CTL2(bank), ctl);
+ }
+}
+
+/*
* This scans all the machine check banks of the current CPU to see if
* there are any machine checks. Any non-recoverable errors are
* reported immediately via mca_log(). The current thread must be
- * pinned when this is called. The 'mcip' parameter indicates if we
- * are being called from the MC exception handler. In that case this
- * function returns true if the system is restartable. Otherwise, it
- * returns a count of the number of valid MC records found.
+ * pinned when this is called. The 'mode' parameter indicates if we
+ * are being called from the MC exception handler, the CMCI handler,
+ * or the periodic poller. In the MC exception case this function
+ * returns true if the system is restartable. Otherwise, it returns a
+ * count of the number of valid MC records found.
*/
static int
-mca_scan(int mcip)
+mca_scan(enum scan_mode mode)
{
struct mca_record rec;
uint64_t mcg_cap, ucmask;
- int count, i, recoverable;
+ int count, i, recoverable, valid;
count = 0;
recoverable = 1;
ucmask = MC_STATUS_UC | MC_STATUS_PCC;
/* When handling a MCE#, treat the OVER flag as non-restartable. */
- if (mcip)
+ if (mode == MCE)
ucmask |= MC_STATUS_OVER;
mcg_cap = rdmsr(MSR_MCG_CAP);
for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
- if (mca_check_status(i, &rec)) {
+ /*
+ * For a CMCI, only check banks this CPU is
+ * responsible for.
+ */
+ if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
+ continue;
+
+ valid = mca_check_status(i, &rec);
+ if (valid) {
count++;
if (rec.mr_status & ucmask) {
recoverable = 0;
@@ -433,8 +537,15 @@ mca_scan(int mcip)
}
mca_record_entry(&rec);
}
+
+ /*
+ * If this is a bank this CPU monitors via CMCI,
+ * update the threshold.
+ */
+ if (PCPU_GET(cmci_mask) & (1 << i))
+ cmci_update(mode, i, valid, &rec);
}
- return (mcip ? recoverable : count);
+ return (mode == MCE ? recoverable : count);
}
/*
@@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending)
continue;
sched_bind(td, cpu);
thread_unlock(td);
- count += mca_scan(0);
+ count += mca_scan(POLLED);
thread_lock(td);
sched_unbind(td);
}
@@ -511,7 +622,24 @@ mca_startup(void *dummy)
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
static void
-mca_setup(void)
+cmci_setup(uint64_t mcg_cap)
+{
+ int i;
+
+ cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *),
+ M_MCA, M_WAITOK);
+ cmc_banks = mcg_cap & MCG_CAP_COUNT;
+ for (i = 0; i <= mp_maxid; i++)
+ cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+ M_MCA, M_WAITOK | M_ZERO);
+ SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+ "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &cmc_throttle, 0, sysctl_positive_int, "I",
+ "Interval in seconds to throttle corrected MC interrupts");
+}
+
+static void
+mca_setup(uint64_t mcg_cap)
{
mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
@@ -522,13 +650,62 @@ mca_setup(void)
"count", CTLFLAG_RD, &mca_count, 0, "Record count");
SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
- 0, sysctl_mca_ticks, "I",
+ 0, sysctl_positive_int, "I",
"Periodic interval in seconds to scan for machine checks");
SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
+ if (mcg_cap & MCG_CAP_CMCI_P)
+ cmci_setup(mcg_cap);
+}
+
+/*
+ * See if we should monitor CMCI for this bank. If CMCI_EN is already
+ * set in MC_CTL2, then another CPU is responsible for this bank, so
+ * ignore it. If CMCI_EN returns zero after being set, then this bank
+ * does not support CMCI. If this CPU sets CMCI_EN, then it should
+ * now monitor this bank.
+ */
+static void
+cmci_monitor(int i)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+
+ KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
+
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ if (ctl & MC_CTL2_CMCI_EN)
+ /* Already monitored by another CPU. */
+ return;
+
+ /* Set the threshold to one event for now. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= MC_CTL2_CMCI_EN | 1;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ if (!(ctl & MC_CTL2_CMCI_EN))
+ /* This bank does not support CMCI. */
+ return;
+
+ cc = &cmc_state[PCPU_GET(cpuid)][i];
+
+ /* Determine maximum threshold. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= 0x7fff;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
+
+ /* Start off with a threshold of 1. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= 1;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+
+ /* Mark this bank as monitored. */
+ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}
/* Must be executed on each CPU. */
@@ -554,14 +731,14 @@ mca_init(void)
workaround_erratum383 = 1;
if (cpu_feature & CPUID_MCA) {
- if (PCPU_GET(cpuid) == 0)
- mca_setup();
+ PCPU_SET(cmci_mask, 0);
- sched_pin();
mcg_cap = rdmsr(MSR_MCG_CAP);
if (mcg_cap & MCG_CAP_CTL_P)
/* Enable MCA features. */
wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
+ if (PCPU_GET(cpuid) == 0)
+ mca_setup(mcg_cap);
/*
* Disable logging of level one TLB parity (L1TP) errors by
@@ -597,10 +774,16 @@ mca_init(void)
if (!skip)
wrmsr(MSR_MC_CTL(i), ctl);
+
+ if (mcg_cap & MCG_CAP_CMCI_P)
+ cmci_monitor(i);
+
/* Clear all errors. */
wrmsr(MSR_MC_STATUS(i), 0);
}
- sched_unpin();
+
+ if (PCPU_GET(cmci_mask) != 0)
+ lapic_enable_cmc();
}
load_cr4(rcr4() | CR4_MCE);
@@ -624,7 +807,7 @@ mca_intr(void)
}
/* Scan the banks and check for any non-recoverable errors. */
- recoverable = mca_scan(1);
+ recoverable = mca_scan(MCE);
mcg_status = rdmsr(MSR_MCG_STATUS);
if (!(mcg_status & MCG_STATUS_RIPV))
recoverable = 0;
@@ -633,3 +816,31 @@ mca_intr(void)
wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
return (recoverable);
}
+
+/* Called for a CMCI (correctable machine check interrupt). */
+void
+cmc_intr(void)
+{
+ struct mca_internal *mca;
+ int count;
+
+ /*
+ * Serialize MCA bank scanning to prevent collisions from
+ * sibling threads.
+ */
+ count = mca_scan(CMCI);
+
+ /* If we found any records, log them to the console. */
+ if (count != 0) {
+ mtx_lock_spin(&mca_lock);
+ STAILQ_FOREACH(mca, &mca_records, link) {
+ if (!mca->logged) {
+ mca->logged = 1;
+ mtx_unlock_spin(&mca_lock);
+ mca_log(&mca->rec);
+ mtx_lock_spin(&mca_lock);
+ }
+ }
+ mtx_unlock_spin(&mca_lock);
+ }
+}
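
A note on the ownership handshake in cmci_monitor() above: the first CPU to set MC_CTL2_CMCI_EN in a shared bank's MC_CTL2 register claims that bank, and a readback that comes up clear means the bank does not implement CMCI at all.  The runnable sketch below demonstrates the write-then-readback probe with a fake in-memory MSR standing in for rdmsr()/wrmsr(); claim_bank, msr_read, and msr_write are hypothetical names.

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CTL2_CMCI_EN	(1ULL << 30)

/*
 * Fake MSR file standing in for rdmsr()/wrmsr().  Unlike real
 * hardware, it always latches CMCI_EN, so the "bank lacks CMCI"
 * path can never trigger in this simulation.
 */
static uint64_t fake_msr[8];
static uint64_t msr_read(int i) { return (fake_msr[i]); }
static void msr_write(int i, uint64_t v) { fake_msr[i] = v; }

/*
 * Claim a bank for CMCI monitoring.  If CMCI_EN is already set, a
 * sibling CPU sharing the bank owns it; if CMCI_EN reads back clear
 * after being set, the bank has no CMCI support.
 */
static bool
claim_bank(int bank)
{
	uint64_t ctl;

	ctl = msr_read(bank);
	if (ctl & CTL2_CMCI_EN)
		return (false);			/* owned by another CPU */
	msr_write(bank, ctl | CTL2_CMCI_EN | 1);	/* threshold of 1 */
	return ((msr_read(bank) & CTL2_CMCI_EN) != 0);
}

int
main(void)
{

	printf("first claim: %d\n", claim_bank(0));	/* 1: now owned */
	printf("second claim: %d\n", claim_bank(0));	/* 0: already owned */
	return (0);
}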
diff --git a/sys/amd64/include/apicreg.h b/sys/amd64/include/apicreg.h
index 33f293b..fee629b 100644
--- a/sys/amd64/include/apicreg.h
+++ b/sys/amd64/include/apicreg.h
@@ -89,7 +89,7 @@
* 2C0 Reserved
* 2D0 Reserved
* 2E0 Reserved
- * 2F0 Reserved
+ * 2F0 Local Vector Table (CMCI) R/W
* 300 ICR_LOW Interrupt Command Reg. (0-31) R/W
* 310 ICR_HI Interrupt Command Reg. (32-63) R/W
* 320 Local Vector Table (Timer) R/W
@@ -172,7 +172,7 @@ struct LAPIC {
/* reserved */ PAD4;
/* reserved */ PAD4;
/* reserved */ PAD4;
- /* reserved */ PAD4;
+ u_int32_t lvt_cmci; PAD3;
u_int32_t icr_lo; PAD3;
u_int32_t icr_hi; PAD3;
u_int32_t lvt_timer; PAD3;
diff --git a/sys/amd64/include/apicvar.h b/sys/amd64/include/apicvar.h
index 8dc04a9..4968842 100644
--- a/sys/amd64/include/apicvar.h
+++ b/sys/amd64/include/apicvar.h
@@ -108,8 +108,9 @@
#define APIC_LOCAL_INTS 240
#define APIC_ERROR_INT APIC_LOCAL_INTS
#define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
+#define APIC_CMC_INT (APIC_LOCAL_INTS + 2)
-#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2)
+#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3)
#define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */
#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */
#define IPI_INVLPG (APIC_IPI_INTS + 2)
@@ -142,7 +143,8 @@
#define LVT_ERROR 3
#define LVT_PMC 4
#define LVT_THERMAL 5
-#define LVT_MAX LVT_THERMAL
+#define LVT_CMCI 6
+#define LVT_MAX LVT_CMCI
#ifndef LOCORE
@@ -178,8 +180,8 @@ struct apic_enumerator {
inthand_t
IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
- IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint),
- IDTVEC(timerint);
+ IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
+ IDTVEC(spuriousint), IDTVEC(timerint);
extern vm_paddr_t lapic_paddr;
extern int apic_cpuids[];
@@ -209,6 +211,7 @@ void lapic_create(u_int apic_id, int boot_cpu);
void lapic_disable(void);
void lapic_disable_pmc(void);
void lapic_dump(const char *str);
+void lapic_enable_cmc(void);
int lapic_enable_pmc(void);
void lapic_eoi(void);
int lapic_id(void);
@@ -217,6 +220,7 @@ int lapic_intr_pending(u_int vector);
void lapic_ipi_raw(register_t icrlo, u_int dest);
void lapic_ipi_vectored(u_int vector, int dest);
int lapic_ipi_wait(int delay);
+void lapic_handle_cmc(void);
void lapic_handle_error(void);
void lapic_handle_intr(int vector, struct trapframe *frame);
void lapic_handle_timer(struct trapframe *frame);
diff --git a/sys/amd64/include/mca.h b/sys/amd64/include/mca.h
index bc09480..951750f 100644
--- a/sys/amd64/include/mca.h
+++ b/sys/amd64/include/mca.h
@@ -46,6 +46,7 @@ struct mca_record {
#ifdef _KERNEL
+void cmc_intr(void);
void mca_init(void);
int mca_intr(void);
diff --git a/sys/amd64/include/pcpu.h b/sys/amd64/include/pcpu.h
index 30f7a7b..a55627f 100644
--- a/sys/amd64/include/pcpu.h
+++ b/sys/amd64/include/pcpu.h
@@ -75,7 +75,8 @@
/* Pointer to the CPU LDT descriptor */ \
struct system_segment_descriptor *pc_ldt; \
/* Pointer to the CPU TSS descriptor */ \
- struct system_segment_descriptor *pc_tss \
+ struct system_segment_descriptor *pc_tss; \
+ u_int pc_cmci_mask /* MCx banks for CMCI */ \
PCPU_XEN_FIELDS
#ifdef _KERNEL
diff --git a/sys/amd64/include/specialreg.h b/sys/amd64/include/specialreg.h
index 895619c..4e19d8e 100644
--- a/sys/amd64/include/specialreg.h
+++ b/sys/amd64/include/specialreg.h
@@ -385,7 +385,7 @@
#define MC_STATUS_VAL 0x8000000000000000
#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */
#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */
-#define MC_CTL2_THRESHOLD 0x0000000000003fff
+#define MC_CTL2_THRESHOLD 0x0000000000007fff
#define MC_CTL2_CMCI_EN 0x0000000040000000
/*
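
The widened MC_CTL2_THRESHOLD mask (15 bits instead of 14) matches the corrected-error count field that cmci_update() reads from the bank status register, which occupies bits 52:38 of MCi_STATUS per the Intel SDM; that is why the code above shifts by 38.  A minimal sketch of the extraction follows; the MC_STATUS_COR_COUNT value is an assumption taken from the SDM, as the patch does not show its definition.

#include <stdint.h>

#define MC_STATUS_COR_COUNT	0x001fffc000000000ULL	/* bits 52:38, assumed */

/* Extract the corrected-error count from an MCi_STATUS value. */
static inline int
cor_count(uint64_t status)
{

	return ((int)((status & MC_STATUS_COR_COUNT) >> 38));
}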
diff --git a/sys/i386/i386/apic_vector.s b/sys/i386/i386/apic_vector.s
index c4c5b01..e3000e1 100644
--- a/sys/i386/i386/apic_vector.s
+++ b/sys/i386/i386/apic_vector.s
@@ -111,6 +111,19 @@ IDTVEC(timerint)
jmp doreti
/*
+ * Local APIC CMCI handler.
+ */
+ .text
+ SUPERALIGN_TEXT
+IDTVEC(cmcint)
+ PUSH_FRAME
+ SET_KERNEL_SREGS
+ FAKE_MCOUNT(TF_EIP(%esp))
+ call lapic_handle_cmc
+ MEXITCOUNT
+ jmp doreti
+
+/*
* Local APIC error interrupt handler.
*/
.text
diff --git a/sys/i386/i386/mca.c b/sys/i386/i386/mca.c
index 8d33b51..6ede87b 100644
--- a/sys/i386/i386/mca.c
+++ b/sys/i386/i386/mca.c
@@ -33,6 +33,8 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
+#include <sys/bus.h>
+#include <sys/interrupt.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
@@ -43,11 +45,29 @@ __FBSDID("$FreeBSD$");
#include <sys/sysctl.h>
#include <sys/systm.h>
#include <sys/taskqueue.h>
+#include <machine/intr_machdep.h>
+#include <machine/apicvar.h>
#include <machine/cputypes.h>
#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/specialreg.h>
+/* Modes for mca_scan() */
+enum scan_mode {
+ POLLED,
+ MCE,
+ CMCI,
+};
+
+/*
+ * State maintained for each monitored MCx bank to control the
+ * corrected machine check interrupt threshold.
+ */
+struct cmc_state {
+ int max_threshold;
+ int last_intr;
+};
+
struct mca_internal {
struct mca_record rec;
int logged;
@@ -79,19 +99,22 @@ static struct callout mca_timer;
static int mca_ticks = 3600; /* Check hourly by default. */
static struct task mca_task;
static struct mtx mca_lock;
+static struct cmc_state **cmc_state; /* Indexed by cpuid, bank */
+static int cmc_banks;
+static int cmc_throttle = 60; /* Time in seconds to throttle CMCI. */
static int
-sysctl_mca_ticks(SYSCTL_HANDLER_ARGS)
+sysctl_positive_int(SYSCTL_HANDLER_ARGS)
{
int error, value;
- value = mca_ticks;
+ value = *(int *)arg1;
error = sysctl_handle_int(oidp, &value, 0, req);
if (error || req->newptr == NULL)
return (error);
if (value <= 0)
return (EINVAL);
- mca_ticks = value;
+ *(int *)arg1 = value;
return (0);
}
@@ -401,31 +424,112 @@ mca_record_entry(const struct mca_record *record)
}
/*
+ * Update the interrupt threshold for a CMCI. The strategy is to use
+ * a low trigger that interrupts as soon as the first event occurs.
+ * However, if a steady stream of events arrives, the threshold is
+ * increased until the interrupts are throttled to once every
+ * cmc_throttle seconds or the periodic scan. If a periodic scan
+ * finds that the threshold is too high, it is lowered.
+ */
+static void
+cmci_update(enum scan_mode mode, int bank, int valid, struct mca_record *rec)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+ u_int delta;
+ int count, limit;
+
+ /* Fetch the current limit for this bank. */
+ cc = &cmc_state[PCPU_GET(cpuid)][bank];
+ ctl = rdmsr(MSR_MC_CTL2(bank));
+ count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+ delta = (u_int)(ticks - cc->last_intr);
+
+ /*
+ * If an interrupt was received less than cmc_throttle seconds
+ * since the previous interrupt and the count from the current
+ * event is greater than or equal to the current threshold,
+ * double the threshold up to the max.
+ */
+ if (mode == CMCI && valid) {
+ limit = ctl & MC_CTL2_THRESHOLD;
+ if (delta < cmc_throttle && count >= limit &&
+ limit < cc->max_threshold) {
+ limit = min(limit << 1, cc->max_threshold);
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= limit;
+ wrmsr(MSR_MC_CTL2(bank), ctl);
+ }
+ cc->last_intr = ticks;
+ return;
+ }
+
+ /*
+ * When the banks are polled, check to see if the threshold
+ * should be lowered.
+ */
+ if (mode != POLLED)
+ return;
+
+ /* If a CMCI occurred recently, do nothing for now. */
+ if (delta < cmc_throttle)
+ return;
+
+ /*
+ * Compute a new limit based on the average rate of events per
+ * cmc_throttle seconds since the last interrupt.
+ */
+ if (valid) {
+ count = (rec->mr_status & MC_STATUS_COR_COUNT) >> 38;
+ limit = count * cmc_throttle / delta;
+ if (limit <= 0)
+ limit = 1;
+ else if (limit > cc->max_threshold)
+ limit = cc->max_threshold;
+ } else
+ limit = 1;
+ if ((ctl & MC_CTL2_THRESHOLD) != limit) {
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= limit;
+ wrmsr(MSR_MC_CTL2(bank), ctl);
+ }
+}
+
+/*
* This scans all the machine check banks of the current CPU to see if
* there are any machine checks. Any non-recoverable errors are
* reported immediately via mca_log(). The current thread must be
- * pinned when this is called. The 'mcip' parameter indicates if we
- * are being called from the MC exception handler. In that case this
- * function returns true if the system is restartable. Otherwise, it
- * returns a count of the number of valid MC records found.
+ * pinned when this is called. The 'mode' parameter indicates if we
+ * are being called from the MC exception handler, the CMCI handler,
+ * or the periodic poller. In the MC exception case this function
+ * returns true if the system is restartable. Otherwise, it returns a
+ * count of the number of valid MC records found.
*/
static int
-mca_scan(int mcip)
+mca_scan(enum scan_mode mode)
{
struct mca_record rec;
uint64_t mcg_cap, ucmask;
- int count, i, recoverable;
+ int count, i, recoverable, valid;
count = 0;
recoverable = 1;
ucmask = MC_STATUS_UC | MC_STATUS_PCC;
/* When handling a MCE#, treat the OVER flag as non-restartable. */
- if (mcip)
+ if (mode == MCE)
ucmask |= MC_STATUS_OVER;
mcg_cap = rdmsr(MSR_MCG_CAP);
for (i = 0; i < (mcg_cap & MCG_CAP_COUNT); i++) {
- if (mca_check_status(i, &rec)) {
+ /*
+ * For a CMCI, only check banks this CPU is
+ * responsible for.
+ */
+ if (mode == CMCI && !(PCPU_GET(cmci_mask) & 1 << i))
+ continue;
+
+ valid = mca_check_status(i, &rec);
+ if (valid) {
count++;
if (rec.mr_status & ucmask) {
recoverable = 0;
@@ -433,8 +537,15 @@ mca_scan(int mcip)
}
mca_record_entry(&rec);
}
+
+ /*
+ * If this is a bank this CPU monitors via CMCI,
+ * update the threshold.
+ */
+ if (PCPU_GET(cmci_mask) & (1 << i))
+ cmci_update(mode, i, valid, &rec);
}
- return (mcip ? recoverable : count);
+ return (mode == MCE ? recoverable : count);
}
/*
@@ -457,7 +568,7 @@ mca_scan_cpus(void *context, int pending)
continue;
sched_bind(td, cpu);
thread_unlock(td);
- count += mca_scan(0);
+ count += mca_scan(POLLED);
thread_lock(td);
sched_unbind(td);
}
@@ -511,7 +622,24 @@ mca_startup(void *dummy)
SYSINIT(mca_startup, SI_SUB_SMP, SI_ORDER_ANY, mca_startup, NULL);
static void
-mca_setup(void)
+cmci_setup(uint64_t mcg_cap)
+{
+ int i;
+
+ cmc_state = malloc((mp_maxid + 1) * sizeof(struct cmc_state *),
+ M_MCA, M_WAITOK);
+ cmc_banks = mcg_cap & MCG_CAP_COUNT;
+ for (i = 0; i <= mp_maxid; i++)
+ cmc_state[i] = malloc(sizeof(struct cmc_state) * cmc_banks,
+ M_MCA, M_WAITOK | M_ZERO);
+ SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
+ "cmc_throttle", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
+ &cmc_throttle, 0, sysctl_positive_int, "I",
+ "Interval in seconds to throttle corrected MC interrupts");
+}
+
+static void
+mca_setup(uint64_t mcg_cap)
{
mtx_init(&mca_lock, "mca", NULL, MTX_SPIN);
@@ -522,13 +650,62 @@ mca_setup(void)
"count", CTLFLAG_RD, &mca_count, 0, "Record count");
SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"interval", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, &mca_ticks,
- 0, sysctl_mca_ticks, "I",
+ 0, sysctl_positive_int, "I",
"Periodic interval in seconds to scan for machine checks");
SYSCTL_ADD_NODE(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"records", CTLFLAG_RD, sysctl_mca_records, "Machine check records");
SYSCTL_ADD_PROC(NULL, SYSCTL_STATIC_CHILDREN(_hw_mca), OID_AUTO,
"force_scan", CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, 0,
sysctl_mca_scan, "I", "Force an immediate scan for machine checks");
+ if (mcg_cap & MCG_CAP_CMCI_P)
+ cmci_setup(mcg_cap);
+}
+
+/*
+ * See if we should monitor CMCI for this bank. If CMCI_EN is already
+ * set in MC_CTL2, then another CPU is responsible for this bank, so
+ * ignore it. If CMCI_EN returns zero after being set, then this bank
+ * does not support CMCI. If this CPU sets CMCI_EN, then it should
+ * now monitor this bank.
+ */
+static void
+cmci_monitor(int i)
+{
+ struct cmc_state *cc;
+ uint64_t ctl;
+
+ KASSERT(i < cmc_banks, ("CPU %d has more MC banks", PCPU_GET(cpuid)));
+
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ if (ctl & MC_CTL2_CMCI_EN)
+ /* Already monitored by another CPU. */
+ return;
+
+ /* Set the threshold to one event for now. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= MC_CTL2_CMCI_EN | 1;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ if (!(ctl & MC_CTL2_CMCI_EN))
+ /* This bank does not support CMCI. */
+ return;
+
+ cc = &cmc_state[PCPU_GET(cpuid)][i];
+
+ /* Determine maximum threshold. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= 0x7fff;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+ ctl = rdmsr(MSR_MC_CTL2(i));
+ cc->max_threshold = ctl & MC_CTL2_THRESHOLD;
+
+ /* Start off with a threshold of 1. */
+ ctl &= ~MC_CTL2_THRESHOLD;
+ ctl |= 1;
+ wrmsr(MSR_MC_CTL2(i), ctl);
+
+ /* Mark this bank as monitored. */
+ PCPU_SET(cmci_mask, PCPU_GET(cmci_mask) | 1 << i);
}
/* Must be executed on each CPU. */
@@ -554,14 +731,14 @@ mca_init(void)
workaround_erratum383 = 1;
if (cpu_feature & CPUID_MCA) {
- if (PCPU_GET(cpuid) == 0)
- mca_setup();
+ PCPU_SET(cmci_mask, 0);
- sched_pin();
mcg_cap = rdmsr(MSR_MCG_CAP);
if (mcg_cap & MCG_CAP_CTL_P)
/* Enable MCA features. */
wrmsr(MSR_MCG_CTL, MCG_CTL_ENABLE);
+ if (PCPU_GET(cpuid) == 0)
+ mca_setup(mcg_cap);
/*
* Disable logging of level one TLB parity (L1TP) errors by
@@ -597,10 +774,16 @@ mca_init(void)
if (!skip)
wrmsr(MSR_MC_CTL(i), ctl);
+
+ if (mcg_cap & MCG_CAP_CMCI_P)
+ cmci_monitor(i);
+
/* Clear all errors. */
wrmsr(MSR_MC_STATUS(i), 0);
}
- sched_unpin();
+
+ if (PCPU_GET(cmci_mask) != 0)
+ lapic_enable_cmc();
}
load_cr4(rcr4() | CR4_MCE);
@@ -624,7 +807,7 @@ mca_intr(void)
}
/* Scan the banks and check for any non-recoverable errors. */
- recoverable = mca_scan(1);
+ recoverable = mca_scan(MCE);
mcg_status = rdmsr(MSR_MCG_STATUS);
if (!(mcg_status & MCG_STATUS_RIPV))
recoverable = 0;
@@ -633,3 +816,31 @@ mca_intr(void)
wrmsr(MSR_MCG_STATUS, mcg_status & ~MCG_STATUS_MCIP);
return (recoverable);
}
+
+/* Called for a CMCI (correctable machine check interrupt). */
+void
+cmc_intr(void)
+{
+ struct mca_internal *mca;
+ int count;
+
+ /*
+ * Serialize MCA bank scanning to prevent collisions from
+ * sibling threads.
+ */
+ count = mca_scan(CMCI);
+
+ /* If we found any records, log them to the console. */
+ if (count != 0) {
+ mtx_lock_spin(&mca_lock);
+ STAILQ_FOREACH(mca, &mca_records, link) {
+ if (!mca->logged) {
+ mca->logged = 1;
+ mtx_unlock_spin(&mca_lock);
+ mca_log(&mca->rec);
+ mtx_lock_spin(&mca_lock);
+ }
+ }
+ mtx_unlock_spin(&mca_lock);
+ }
+}
diff --git a/sys/i386/include/apicreg.h b/sys/i386/include/apicreg.h
index 33f293b..fee629b 100644
--- a/sys/i386/include/apicreg.h
+++ b/sys/i386/include/apicreg.h
@@ -89,7 +89,7 @@
* 2C0 Reserved
* 2D0 Reserved
* 2E0 Reserved
- * 2F0 Reserved
+ * 2F0 Local Vector Table (CMCI) R/W
* 300 ICR_LOW Interrupt Command Reg. (0-31) R/W
* 310 ICR_HI Interrupt Command Reg. (32-63) R/W
* 320 Local Vector Table (Timer) R/W
@@ -172,7 +172,7 @@ struct LAPIC {
/* reserved */ PAD4;
/* reserved */ PAD4;
/* reserved */ PAD4;
- /* reserved */ PAD4;
+ u_int32_t lvt_cmci; PAD3;
u_int32_t icr_lo; PAD3;
u_int32_t icr_hi; PAD3;
u_int32_t lvt_timer; PAD3;
diff --git a/sys/i386/include/apicvar.h b/sys/i386/include/apicvar.h
index adfc1e8..f917357 100644
--- a/sys/i386/include/apicvar.h
+++ b/sys/i386/include/apicvar.h
@@ -108,7 +108,8 @@
#define APIC_LOCAL_INTS 240
#define APIC_ERROR_INT APIC_LOCAL_INTS
#define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
-#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2)
+#define APIC_CMC_INT (APIC_LOCAL_INTS + 2)
+#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3)
#define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */
#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */
@@ -135,7 +136,8 @@
#define APIC_LOCAL_INTS 240
#define APIC_ERROR_INT APIC_LOCAL_INTS
#define APIC_THERMAL_INT (APIC_LOCAL_INTS + 1)
-#define APIC_IPI_INTS (APIC_LOCAL_INTS + 2)
+#define APIC_CMC_INT (APIC_LOCAL_INTS + 2)
+#define APIC_IPI_INTS (APIC_LOCAL_INTS + 3)
#define IPI_RENDEZVOUS (APIC_IPI_INTS) /* Inter-CPU rendezvous. */
#define IPI_INVLTLB (APIC_IPI_INTS + 1) /* TLB Shootdown IPIs */
@@ -170,7 +172,8 @@
#define LVT_ERROR 3
#define LVT_PMC 4
#define LVT_THERMAL 5
-#define LVT_MAX LVT_THERMAL
+#define LVT_CMCI 6
+#define LVT_MAX LVT_CMCI
#ifndef LOCORE
@@ -206,8 +209,8 @@ struct apic_enumerator {
inthand_t
IDTVEC(apic_isr1), IDTVEC(apic_isr2), IDTVEC(apic_isr3),
IDTVEC(apic_isr4), IDTVEC(apic_isr5), IDTVEC(apic_isr6),
- IDTVEC(apic_isr7), IDTVEC(errorint), IDTVEC(spuriousint),
- IDTVEC(timerint);
+ IDTVEC(apic_isr7), IDTVEC(cmcint), IDTVEC(errorint),
+ IDTVEC(spuriousint), IDTVEC(timerint);
extern vm_paddr_t lapic_paddr;
extern int apic_cpuids[];
@@ -237,6 +240,7 @@ void lapic_create(u_int apic_id, int boot_cpu);
void lapic_disable(void);
void lapic_disable_pmc(void);
void lapic_dump(const char *str);
+void lapic_enable_cmc(void);
int lapic_enable_pmc(void);
void lapic_eoi(void);
int lapic_id(void);
@@ -245,6 +249,7 @@ int lapic_intr_pending(u_int vector);
void lapic_ipi_raw(register_t icrlo, u_int dest);
void lapic_ipi_vectored(u_int vector, int dest);
int lapic_ipi_wait(int delay);
+void lapic_handle_cmc(void);
void lapic_handle_error(void);
void lapic_handle_intr(int vector, struct trapframe *frame);
void lapic_handle_timer(struct trapframe *frame);
diff --git a/sys/i386/include/mca.h b/sys/i386/include/mca.h
index bc09480..951750f 100644
--- a/sys/i386/include/mca.h
+++ b/sys/i386/include/mca.h
@@ -46,6 +46,7 @@ struct mca_record {
#ifdef _KERNEL
+void cmc_intr(void);
void mca_init(void);
int mca_intr(void);
diff --git a/sys/i386/include/pcpu.h b/sys/i386/include/pcpu.h
index 6cc03d0..5345eb6 100644
--- a/sys/i386/include/pcpu.h
+++ b/sys/i386/include/pcpu.h
@@ -76,6 +76,7 @@ struct shadow_time_info {
u_int pc_acpi_id; /* ACPI CPU id */ \
u_int pc_apic_id; \
int pc_private_tss; /* Flag indicating private tss*/\
+ u_int pc_cmci_mask; /* MCx banks for CMCI */ \
u_int pc_cr3; /* track cr3 for R1/R3*/ \
u_int pc_pdir; \
u_int pc_lazypmap; \
@@ -102,7 +103,8 @@ struct shadow_time_info {
int pc_currentldt; \
u_int pc_acpi_id; /* ACPI CPU id */ \
u_int pc_apic_id; \
- int pc_private_tss /* Flag indicating private tss */
+ int pc_private_tss; /* Flag indicating private tss*/\
+ u_int pc_cmci_mask /* MCx banks for CMCI */ \
#endif
diff --git a/sys/i386/include/specialreg.h b/sys/i386/include/specialreg.h
index efcf924..b550574 100644
--- a/sys/i386/include/specialreg.h
+++ b/sys/i386/include/specialreg.h
@@ -454,7 +454,7 @@
#define MC_STATUS_VAL 0x8000000000000000
#define MC_MISC_RA_LSB 0x000000000000003f /* If MCG_CAP_SER_P */
#define MC_MISC_ADDRESS_MODE 0x00000000000001c0 /* If MCG_CAP_SER_P */
-#define MC_CTL2_THRESHOLD 0x0000000000003fff
+#define MC_CTL2_THRESHOLD 0x0000000000007fff
#define MC_CTL2_CMCI_EN 0x0000000040000000
/*
diff --git a/sys/x86/x86/local_apic.c b/sys/x86/x86/local_apic.c
index f40026b..7fec4f6 100644
--- a/sys/x86/x86/local_apic.c
+++ b/sys/x86/x86/local_apic.c
@@ -59,6 +59,7 @@ __FBSDID("$FreeBSD$");
#include <machine/frame.h>
#include <machine/intr_machdep.h>
#include <machine/apicvar.h>
+#include <machine/mca.h>
#include <machine/md_var.h>
#include <machine/smp.h>
#include <machine/specialreg.h>
@@ -130,6 +131,7 @@ static struct lvt lvts[LVT_MAX + 1] = {
{ 1, 1, 0, 1, APIC_LVT_DM_FIXED, APIC_ERROR_INT }, /* Error */
{ 1, 1, 1, 1, APIC_LVT_DM_NMI, 0 }, /* PMC */
{ 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_THERMAL_INT }, /* Thermal */
+ { 1, 1, 1, 1, APIC_LVT_DM_FIXED, APIC_CMC_INT }, /* CMCI */
};
static inthand_t *ioint_handlers[] = {
@@ -235,6 +237,9 @@ lapic_init(vm_paddr_t addr)
setidt(APIC_ERROR_INT, IDTVEC(errorint), SDT_APIC, SEL_KPL, GSEL_APIC);
/* XXX: Thermal interrupt */
+
+ /* Local APIC CMCI. */
+ setidt(APIC_CMC_INT, IDTVEC(cmcint), SDT_APICT, SEL_KPL, GSEL_APIC);
}
/*
@@ -260,7 +265,7 @@ lapic_create(u_int apic_id, int boot_cpu)
*/
lapics[apic_id].la_present = 1;
lapics[apic_id].la_id = apic_id;
- for (i = 0; i < LVT_MAX; i++) {
+ for (i = 0; i <= LVT_MAX; i++) {
lapics[apic_id].la_lvts[i] = lvts[i];
lapics[apic_id].la_lvts[i].lvt_active = 0;
}
@@ -290,6 +295,7 @@ lapic_dump(const char* str)
printf(" timer: 0x%08x therm: 0x%08x err: 0x%08x pmc: 0x%08x\n",
lapic->lvt_timer, lapic->lvt_thermal, lapic->lvt_error,
lapic->lvt_pcint);
+ printf(" cmci: 0x%08x\n", lapic->lvt_cmci);
}
void
@@ -341,6 +347,10 @@ lapic_setup(int boot)
/* XXX: Thermal LVT */
+ /* Program the CMCI LVT entry if present. */
+ if (maxlvt >= LVT_CMCI)
+ lapic->lvt_cmci = lvt_mode(la, LVT_CMCI, lapic->lvt_cmci);
+
intr_restore(eflags);
}
@@ -838,6 +848,34 @@ lapic_timer_enable_intr(void)
}
void
+lapic_handle_cmc(void)
+{
+
+ lapic_eoi();
+ cmc_intr();
+}
+
+/*
+ * Called from mca_init() to activate the CMC interrupt if this CPU is
+ * responsible for monitoring any MC banks for CMC events. Since mca_init()
+ * is called prior to lapic_setup() during boot, this just needs to unmask
+ * this CPU's LVT_CMCI entry.
+ */
+void
+lapic_enable_cmc(void)
+{
+ u_int apic_id;
+
+ apic_id = PCPU_GET(apic_id);
+ KASSERT(lapics[apic_id].la_present,
+ ("%s: missing APIC %u", __func__, apic_id));
+ lapics[apic_id].la_lvts[LVT_CMCI].lvt_masked = 0;
+ lapics[apic_id].la_lvts[LVT_CMCI].lvt_active = 1;
+ if (bootverbose)
+ printf("lapic%u: CMCI unmasked\n", apic_id);
+}
+
+void
lapic_handle_error(void)
{
u_int32_t esr;