From 12a6611fa16e9c6d2f844fe2175d219c6e9bd95d Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:01 -0500 Subject: x86, UV: Calculate BAU destination timeout Calculate the Broadcast Assist Unit's destination timeout period from the values in the relevant MMR's. Store it in each cpu's per-cpu BAU structure so that a destination timeout can be differentiated from a 'plugged' situation in which all software ack resources are already allocated and a timeout is pending. That case returns an immediate destination error. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 51 ++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 47 insertions(+), 4 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 7fea555..5506836 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -30,6 +30,19 @@ struct msg_desc { struct bau_payload_queue_entry *va_queue_last; }; +/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ +static int timeout_base_ns[] = { + 20, + 160, + 1280, + 10240, + 81920, + 655360, + 5242880, + 167772160 +}; +static int timeout_us; + #define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL static int uv_bau_max_concurrent __read_mostly; @@ -423,7 +436,8 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * pending. In that case hardware returns the * ERROR that looks like a destination timeout. */ - if (cycles_2_us(ttime - bcp->send_message) < BIOS_TO) { + if (cycles_2_us(ttime - bcp->send_message) < + timeout_us) { bcp->conseccompletes = 0; return FLUSH_RETRY_PLUGGED; } @@ -908,12 +922,12 @@ static void uv_ptc_seq_stop(struct seq_file *file, void *data) } static inline unsigned long long -millisec_2_cycles(unsigned long millisec) +microsec_2_cycles(unsigned long microsec) { unsigned long ns; unsigned long long cyc; - ns = millisec * 1000; + ns = microsec * 1000; cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id())); return cyc; } @@ -1259,6 +1273,33 @@ static void __init uv_init_uvhub(int uvhub, int vector) } /* + * We will set BAU_MISC_CONTROL with a timeout period. + * But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT. + * So the destination timeout period has be be calculated from them. + */ +static int +calculate_destination_timeout(void) +{ + unsigned long mmr_image; + int mult1; + int mult2; + int index; + int base; + int ret; + unsigned long ts_ns; + + mult1 = UV_INTD_SOFT_ACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK; + mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL); + index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK; + mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT); + mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK; + base = timeout_base_ns[index]; + ts_ns = base * mult1 * mult2; + ret = ts_ns / 1000; + return ret; +} + +/* * initialize the bau_control structure for each cpu */ static void uv_init_per_cpu(int nuvhubs) @@ -1286,6 +1327,8 @@ static void uv_init_per_cpu(int nuvhubs) }; struct uvhub_desc *uvhub_descs; + timeout_us = calculate_destination_timeout(); + uvhub_descs = (struct uvhub_desc *) kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL); memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc)); @@ -1301,7 +1344,7 @@ static void uv_init_per_cpu(int nuvhubs) bdp->uvhub = uvhub; bdp->pnode = pnode; /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = millisec_2_cycles(3); + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); /* kludge: assume uv_hub.h is constant */ socket = (cpu_physical_id(cpu)>>5)&1; if (socket >= bdp->num_sockets) -- cgit v1.1 From e8e5e8a8048006a12d7777a93baebd6e39496101 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:01 -0500 Subject: x86, UV: BAU tunables into a debugfs file Make the Broadcast Assist Unit driver's nine tuning values variable by making them accessible through a read/write debugfs file. The file will normally be mounted as /sys/kernel/debug/sgi_uv/bau_tunables. The tunables are kept in each cpu's per-cpu BAU structure. The patch also does a little name improvement, and corrects the reset of two destination timeout counters. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 281 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 241 insertions(+), 40 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 5506836..c866177 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -8,6 +8,7 @@ */ #include #include +#include #include #include @@ -42,12 +43,22 @@ static int timeout_base_ns[] = { 167772160 }; static int timeout_us; +static int nobau; -#define UV_INTD_SOFT_ACK_TIMEOUT_PERIOD 0x000000000bUL - -static int uv_bau_max_concurrent __read_mostly; +/* tunables: */ +static int max_bau_concurrent = MAX_BAU_CONCURRENT; +static int max_bau_concurrent_constant = MAX_BAU_CONCURRENT; +static int plugged_delay = PLUGGED_DELAY; +static int plugsb4reset = PLUGSB4RESET; +static int timeoutsb4reset = TIMEOUTSB4RESET; +static int ipi_reset_limit = IPI_RESET_LIMIT; +static int complete_threshold = COMPLETE_THRESHOLD; +static int congested_response_us = CONGESTED_RESPONSE_US; +static int congested_reps = CONGESTED_REPS; +static int congested_period = CONGESTED_PERIOD; +static struct dentry *tunables_dir; +static struct dentry *tunables_file; -static int nobau; static int __init setup_nobau(char *arg) { nobau = 1; @@ -539,23 +550,24 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, unsigned long index; cycles_t time1; cycles_t time2; + cycles_t elapsed; struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; /* - * Spin here while there are hmaster->max_concurrent or more active + * Spin here while there are hmaster->max_bau_concurrent or more active * descriptors. This is the per-uvhub 'throttle'. */ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, - hmaster->max_concurrent)) { + hmaster->max_bau_concurrent)) { stat->s_throttles++; do { cpu_relax(); } while (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, - hmaster->max_concurrent)); + hmaster->max_bau_concurrent)); } while (hmaster->uvhub_quiesce) @@ -609,9 +621,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, * that case hardware immediately returns the ERROR * that looks like a destination timeout. */ - udelay(TIMEOUT_DELAY); + udelay(bcp->plugged_delay); bcp->plugged_tries++; - if (bcp->plugged_tries >= PLUGSB4RESET) { + if (bcp->plugged_tries >= bcp->plugsb4reset) { bcp->plugged_tries = 0; quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); @@ -623,10 +635,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, stat->s_resets_plug++; } } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - hmaster->max_concurrent = 1; + hmaster->max_bau_concurrent = 1; bcp->timeout_tries++; udelay(TIMEOUT_DELAY); - if (bcp->timeout_tries >= TIMEOUTSB4RESET) { + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { bcp->timeout_tries = 0; quiesce_local_uvhub(hmaster); spin_lock(&hmaster->queue_lock); @@ -638,7 +650,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, stat->s_resets_timeout++; } } - if (bcp->ipi_attempts >= 3) { + if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; completion_status = FLUSH_GIVEUP; break; @@ -648,9 +660,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > 5) - && (hmaster->max_concurrent < hmaster->max_concurrent_constant)) - hmaster->max_concurrent++; + bcp->plugged_tries = 0; + bcp->timeout_tries = 0; + + if ((completion_status == FLUSH_COMPLETE) && + (bcp->conseccompletes > bcp->complete_threshold) && + (hmaster->max_bau_concurrent < + hmaster->max_bau_concurrent_constant)) + hmaster->max_bau_concurrent++; /* * hold any cpu not timing out here; no other cpu currently held by @@ -661,9 +678,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, atomic_dec(&hmaster->active_descriptor_count); /* guard against cycles wrap */ - if (time2 > time1) - stat->s_time += (time2 - time1); - else + if (time2 > time1) { + elapsed = time2 - time1; + stat->s_time += elapsed; + } else stat->s_requestor--; /* don't count this one */ if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; @@ -730,10 +748,12 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct ptc_stats *stat; struct bau_control *bcp; + /* kernel was booted 'nobau' */ if (nobau) return cpumask; bcp = &per_cpu(bau_control, cpu); + /* * Each sending cpu has a per-cpu mask which it fills from the caller's * cpu mask. Only remote cpus are converted to uvhubs and copied. @@ -970,6 +990,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) stat->s_resets_plug, stat->s_resets_timeout, stat->s_giveup, stat->s_stimeout, stat->s_busy, stat->s_throttles); + /* destination side statistics */ seq_printf(file, "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", @@ -986,9 +1007,28 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) } /* + * Display the tunables thru debugfs + */ +static ssize_t tunables_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + char buf[300]; + int ret; + + ret = snprintf(buf, 300, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n", + "max_bau_concurrent plugged_delay plugsb4reset", + "timeoutsb4reset ipi_reset_limit complete_threshold", + "congested_response_us congested_reps congested_period", + max_bau_concurrent, plugged_delay, plugsb4reset, + timeoutsb4reset, ipi_reset_limit, complete_threshold, + congested_response_us, congested_reps, congested_period); + + return simple_read_from_buffer(userbuf, count, ppos, buf, ret); +} + +/* * -1: resetf the statistics * 0: display meaning of the statistics - * >0: maximum concurrent active descriptors per uvhub (throttle) */ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, size_t count, loff_t *data) @@ -997,7 +1037,6 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, long input_arg; char optstr[64]; struct ptc_stats *stat; - struct bau_control *bcp; if (count == 0 || count > sizeof(optstr)) return -EINVAL; @@ -1078,24 +1117,149 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, stat = &per_cpu(ptcstats, cpu); memset(stat, 0, sizeof(struct ptc_stats)); } - } else { - uv_bau_max_concurrent = input_arg; - bcp = &per_cpu(bau_control, smp_processor_id()); - if (uv_bau_max_concurrent < 1 || - uv_bau_max_concurrent > bcp->cpus_in_uvhub) { - printk(KERN_DEBUG - "Error: BAU max concurrent %d; %d is invalid\n", - bcp->max_concurrent, uv_bau_max_concurrent); - return -EINVAL; - } - printk(KERN_DEBUG "Set BAU max concurrent:%d\n", - uv_bau_max_concurrent); - for_each_present_cpu(cpu) { - bcp = &per_cpu(bau_control, cpu); - bcp->max_concurrent = uv_bau_max_concurrent; + } + + return count; +} + +static int local_atoi(const char *name) +{ + int val = 0; + + for (;; name++) { + switch (*name) { + case '0' ... '9': + val = 10*val+(*name-'0'); + break; + default: + return val; } } +} + +/* + * set the tunables + * 0 values reset them to defaults + */ +static ssize_t tunables_write(struct file *file, const char __user *user, + size_t count, loff_t *data) +{ + int cpu; + int cnt = 0; + int val; + char *p; + char *q; + char instr[64]; + struct bau_control *bcp; + if (count == 0 || count > sizeof(instr)-1) + return -EINVAL; + if (copy_from_user(instr, user, count)) + return -EFAULT; + + instr[count] = '\0'; + /* count the fields */ + p = instr + strspn(instr, WHITESPACE); + q = p; + for (; *p; p = q + strspn(q, WHITESPACE)) { + q = p + strcspn(p, WHITESPACE); + cnt++; + if (q == p) + break; + } + if (cnt != 9) { + printk(KERN_INFO "bau tunable error: should be 9 numbers\n"); + return -EINVAL; + } + + p = instr + strspn(instr, WHITESPACE); + q = p; + for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) { + q = p + strcspn(p, WHITESPACE); + val = local_atoi(p); + switch (cnt) { + case 0: + if (val == 0) { + max_bau_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent_constant = + MAX_BAU_CONCURRENT; + continue; + } + bcp = &per_cpu(bau_control, smp_processor_id()); + if (val < 1 || val > bcp->cpus_in_uvhub) { + printk(KERN_DEBUG + "Error: BAU max concurrent %d is invalid\n", + val); + return -EINVAL; + } + max_bau_concurrent = val; + max_bau_concurrent_constant = val; + continue; + case 1: + if (val == 0) + plugged_delay = PLUGGED_DELAY; + else + plugged_delay = val; + continue; + case 2: + if (val == 0) + plugsb4reset = PLUGSB4RESET; + else + plugsb4reset = val; + continue; + case 3: + if (val == 0) + timeoutsb4reset = TIMEOUTSB4RESET; + else + timeoutsb4reset = val; + continue; + case 4: + if (val == 0) + ipi_reset_limit = IPI_RESET_LIMIT; + else + ipi_reset_limit = val; + continue; + case 5: + if (val == 0) + complete_threshold = COMPLETE_THRESHOLD; + else + complete_threshold = val; + continue; + case 6: + if (val == 0) + congested_response_us = CONGESTED_RESPONSE_US; + else + congested_response_us = val; + continue; + case 7: + if (val == 0) + congested_reps = CONGESTED_REPS; + else + congested_reps = val; + continue; + case 8: + if (val == 0) + congested_period = CONGESTED_PERIOD; + else + congested_period = val; + continue; + } + if (q == p) + break; + } + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } return count; } @@ -1111,6 +1275,11 @@ static int uv_ptc_proc_open(struct inode *inode, struct file *file) return seq_open(file, &uv_ptc_seq_ops); } +static int tunables_open(struct inode *inode, struct file *file) +{ + return 0; +} + static const struct file_operations proc_uv_ptc_operations = { .open = uv_ptc_proc_open, .read = seq_read, @@ -1119,6 +1288,12 @@ static const struct file_operations proc_uv_ptc_operations = { .release = seq_release, }; +static const struct file_operations tunables_fops = { + .open = tunables_open, + .read = tunables_read, + .write = tunables_write, +}; + static int __init uv_ptc_init(void) { struct proc_dir_entry *proc_uv_ptc; @@ -1133,6 +1308,20 @@ static int __init uv_ptc_init(void) UV_PTC_BASENAME); return -EINVAL; } + + tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL); + if (!tunables_dir) { + printk(KERN_ERR "unable to create debugfs directory %s\n", + UV_BAU_TUNABLES_DIR); + return -EINVAL; + } + tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600, + tunables_dir, NULL, &tunables_fops); + if (!tunables_file) { + printk(KERN_ERR "unable to create debugfs file %s\n", + UV_BAU_TUNABLES_FILE); + return -EINVAL; + } return 0; } @@ -1336,15 +1525,12 @@ static void uv_init_per_cpu(int nuvhubs) bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); spin_lock_init(&bcp->masks_lock); - bcp->max_concurrent = uv_bau_max_concurrent; pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; - /* time interval to catch a hardware stay-busy bug */ - bcp->timeout_interval = microsec_2_cycles(2*timeout_us); /* kludge: assume uv_hub.h is constant */ socket = (cpu_physical_id(cpu)>>5)&1; if (socket >= bdp->num_sockets) @@ -1380,6 +1566,21 @@ static void uv_init_per_cpu(int nuvhubs) } } kfree(uvhub_descs); + for_each_present_cpu(cpu) { + bcp = &per_cpu(bau_control, cpu); + /* time interval to catch a hardware stay-busy bug */ + bcp->timeout_interval = microsec_2_cycles(2*timeout_us); + bcp->max_bau_concurrent = max_bau_concurrent; + bcp->max_bau_concurrent_constant = max_bau_concurrent; + bcp->plugged_delay = plugged_delay; + bcp->plugsb4reset = plugsb4reset; + bcp->timeoutsb4reset = timeoutsb4reset; + bcp->ipi_reset_limit = ipi_reset_limit; + bcp->complete_threshold = complete_threshold; + bcp->congested_response_us = congested_response_us; + bcp->congested_reps = congested_reps; + bcp->congested_period = congested_period; + } } /* @@ -1404,7 +1605,7 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - uv_bau_max_concurrent = MAX_BAU_CONCURRENT; + max_bau_concurrent = MAX_BAU_CONCURRENT; uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); @@ -1437,4 +1638,4 @@ static int __init uv_bau_init(void) return 0; } core_initcall(uv_bau_init); -core_initcall(uv_ptc_init); +fs_initcall(uv_ptc_init); -- cgit v1.1 From 50fb55acc5bbe5ee29d0a65262f4ec286b14d156 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Disable BAU on network congestion The numalink network can become so congested that TLB shootdown using the Broadcast Assist Unit becomes slower than using IPI's. In that case, disable the use of the BAU for a period of time. The period is tunable. When the period expires the use of the BAU is re-enabled. A count of these actions is added to the statistics file. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 76 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 73 insertions(+), 3 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index c866177..dc6a683 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -44,6 +44,9 @@ static int timeout_base_ns[] = { }; static int timeout_us; static int nobau; +static int baudisabled; +static spinlock_t disable_lock; +static cycles_t congested_cycles; /* tunables: */ static int max_bau_concurrent = MAX_BAU_CONCURRENT; @@ -519,6 +522,35 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) return 1; } +/* + * Completions are taking a very long time due to a congested numalink + * network. + */ +static void +disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) +{ + int tcpu; + struct bau_control *tbcp; + + /* let only one cpu do this disabling */ + spin_lock(&disable_lock); + if (!baudisabled && bcp->period_requests && + ((bcp->period_time / bcp->period_requests) > congested_cycles)) { + /* it becomes this cpu's job to turn on the use of the + BAU again */ + baudisabled = 1; + bcp->set_bau_off = 1; + bcp->set_bau_on_time = get_cycles() + + sec_2_cycles(bcp->congested_period); + stat->s_bau_disabled++; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 1; + } + } + spin_unlock(&disable_lock); +} + /** * uv_flush_send_and_wait * @@ -681,6 +713,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, if (time2 > time1) { elapsed = time2 - time1; stat->s_time += elapsed; + if ((completion_status == FLUSH_COMPLETE) && (try == 1)) { + bcp->period_requests++; + bcp->period_time += elapsed; + if ((elapsed > congested_cycles) && + (bcp->period_requests > bcp->congested_reps)) { + disable_for_congestion(bcp, stat); + } + } } else stat->s_requestor--; /* don't count this one */ if (completion_status == FLUSH_COMPLETE && try > 1) @@ -747,12 +787,32 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct cpumask *flush_mask; struct ptc_stats *stat; struct bau_control *bcp; + struct bau_control *tbcp; /* kernel was booted 'nobau' */ if (nobau) return cpumask; bcp = &per_cpu(bau_control, cpu); + stat = &per_cpu(ptcstats, cpu); + + /* bau was disabled due to slow response */ + if (bcp->baudisabled) { + /* the cpu that disabled it must re-enable it */ + if (bcp->set_bau_off) { + if (get_cycles() >= bcp->set_bau_on_time) { + stat->s_bau_reenabled++; + baudisabled = 0; + for_each_present_cpu(tcpu) { + tbcp = &per_cpu(bau_control, tcpu); + tbcp->baudisabled = 0; + tbcp->period_requests = 0; + tbcp->period_time = 0; + } + } + } + return cpumask; + } /* * Each sending cpu has a per-cpu mask which it fills from the caller's @@ -793,7 +853,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, else return NULL; } - stat = &per_cpu(ptcstats, cpu); stat->s_requestor++; stat->s_ntargcpu += remotes; remotes = bau_uvhub_weight(&bau_desc->distribution); @@ -973,7 +1032,9 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "sw_ack recv rtime all "); seq_printf(file, - "one mult none retry canc nocan reset rcan\n"); + "one mult none retry canc nocan reset rcan "); + seq_printf(file, + "disable enable\n"); } if (cpu < num_possible_cpus() && cpu_online(cpu)) { stat = &per_cpu(ptcstats, cpu); @@ -993,7 +1054,7 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) /* destination side statistics */ seq_printf(file, - "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", + "%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", uv_read_global_mmr64(uv_cpu_to_pnode(cpu), UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE), stat->d_requestee, cycles_2_us(stat->d_time), @@ -1001,6 +1062,8 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) stat->d_nomsg, stat->d_retries, stat->d_canceled, stat->d_nocanceled, stat->d_resets, stat->d_rcanceled); + seq_printf(file, "%ld %ld\n", + stat->s_bau_disabled, stat->s_bau_reenabled); } return 0; @@ -1112,6 +1175,10 @@ static ssize_t uv_ptc_proc_write(struct file *file, const char __user *user, "reset: number of ipi-style reset requests processed\n"); printk(KERN_DEBUG "rcan: number messages canceled by reset requests\n"); + printk(KERN_DEBUG + "disable: number times use of the BAU was disabled\n"); + printk(KERN_DEBUG + "enable: number times use of the BAU was re-enabled\n"); } else if (input_arg == -1) { for_each_present_cpu(cpu) { stat = &per_cpu(ptcstats, cpu); @@ -1568,6 +1635,7 @@ static void uv_init_per_cpu(int nuvhubs) kfree(uvhub_descs); for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); + bcp->baudisabled = 0; /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = microsec_2_cycles(2*timeout_us); bcp->max_bau_concurrent = max_bau_concurrent; @@ -1609,6 +1677,8 @@ static int __init uv_bau_init(void) uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); + spin_lock_init(&disable_lock); + congested_cycles = microsec_2_cycles(congested_response_us); uv_init_per_cpu(nuvhubs); -- cgit v1.1 From 712157aa703a01f58c7c17452096ab00b774d0a9 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Shorten access to BAU statistics structure Use a pointer from the per-cpu BAU control structure to the per-cpu BAU statistics structure. We nearly always know the first before needing the second. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index dc6a683..261b965 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -153,7 +153,7 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, struct ptc_stats *stat; msg = mdp->msg; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; stat->d_retries++; /* * cancel any message from msg+1 to the retry itself @@ -217,7 +217,7 @@ static void uv_bau_process_message(struct msg_desc *mdp, * This must be a normal message, or retry of a normal message */ msg = mdp->msg; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; if (msg->address == TLB_FLUSH_ALL) { local_flush_tlb(); stat->d_alltlb++; @@ -301,7 +301,7 @@ uv_do_reset(void *ptr) bcp = &per_cpu(bau_control, smp_processor_id()); rap = (struct reset_args *)ptr; - stat = &per_cpu(ptcstats, bcp->cpu); + stat = bcp->statp; stat->d_resets++; /* @@ -419,7 +419,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mask; cycles_t ttime; cycles_t timeout_time; - struct ptc_stats *stat = &per_cpu(ptcstats, this_cpu); + struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; hmaster = bcp->uvhub_master; @@ -583,7 +583,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, cycles_t time1; cycles_t time2; cycles_t elapsed; - struct ptc_stats *stat = &per_cpu(ptcstats, bcp->cpu); + struct ptc_stats *stat = bcp->statp; struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; @@ -794,7 +794,7 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, return cpumask; bcp = &per_cpu(bau_control, cpu); - stat = &per_cpu(ptcstats, cpu); + stat = bcp->statp; /* bau was disabled due to slow response */ if (bcp->baudisabled) { @@ -903,7 +903,7 @@ void uv_bau_message_interrupt(struct pt_regs *regs) time_start = get_cycles(); bcp = &per_cpu(bau_control, smp_processor_id()); - stat = &per_cpu(ptcstats, smp_processor_id()); + stat = bcp->statp; msgdesc.va_queue_first = bcp->va_queue_first; msgdesc.va_queue_last = bcp->va_queue_last; msg = bcp->bau_msg_head; @@ -1636,6 +1636,7 @@ static void uv_init_per_cpu(int nuvhubs) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); bcp->baudisabled = 0; + bcp->statp = &per_cpu(ptcstats, cpu); /* time interval to catch a hardware stay-busy bug */ bcp->timeout_interval = microsec_2_cycles(2*timeout_us); bcp->max_bau_concurrent = max_bau_concurrent; @@ -1673,7 +1674,6 @@ static int __init uv_bau_init(void) zalloc_cpumask_var_node(&per_cpu(uv_flush_tlb_mask, cur_cpu), GFP_KERNEL, cpu_to_node(cur_cpu)); - max_bau_concurrent = MAX_BAU_CONCURRENT; uv_nshift = uv_hub_info->m_val; uv_mmask = (1UL << uv_hub_info->m_val) - 1; nuvhubs = uv_num_possible_blades(); -- cgit v1.1 From 4faca1550838708d71f6eea14cdacb0876c3a5a4 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: BAU structure rearranging Move some structure definitions from the C code to the BAU header file, and change the organization of that header file a little. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 12 ------------ 1 file changed, 12 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 261b965..d759290 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -23,14 +23,6 @@ #include #include -struct msg_desc { - struct bau_payload_queue_entry *msg; - int msg_slot; - int sw_ack_slot; - struct bau_payload_queue_entry *va_queue_first; - struct bau_payload_queue_entry *va_queue_last; -}; - /* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */ static int timeout_base_ns[] = { 20, @@ -79,10 +71,6 @@ static DEFINE_PER_CPU(struct ptc_stats, ptcstats); static DEFINE_PER_CPU(struct bau_control, bau_control); static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask); -struct reset_args { - int sender; -}; - /* * Determine the first node on a uvhub. 'Nodes' are used for kernel * memory allocation. -- cgit v1.1 From 39847e7f3c8198b14102fe7ba4b3a6a1d84bbcfe Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU software acknowledge Correct the acknowledgment and the reset of a BAU software-acknowledged message. A retry message should be testing only for timed-out resources (mask << 8). (And we delete a log message that might cause unnecessary concern) The acknowledge MMR is |--timed-out--|---pending--|, each is 8 bits. The IPI-driven reset of software acknowledge resources frees both timed out and pending resources. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index d759290..295a411 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -161,15 +161,14 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, slot2 = msg2 - mdp->va_queue_first; mmr = uv_read_local_mmr (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = ((msg2->sw_ack_vector << 8) | - msg2->sw_ack_vector); + msg_res = msg2->sw_ack_vector; /* * This is a message retry; clear the resources held * by the previous message only if they timed out. * If it has not timed out we have an unexpected * situation to report. */ - if (mmr & (msg_res << 8)) { + if (mmr & (msg_res << UV_SW_ACK_NPENDING)) { /* * is the resource timed out? * make everyone ignore the cancelled message. @@ -179,9 +178,9 @@ static inline void uv_bau_process_retry_msg(struct msg_desc *mdp, cancel_count++; uv_write_local_mmr( UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - (msg_res << 8) | msg_res); - } else - printk(KERN_INFO "note bau retry: no effect\n"); + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); + } } } if (!cancel_count) @@ -317,13 +316,13 @@ uv_do_reset(void *ptr) */ mmr = uv_read_local_mmr (UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE); - msg_res = ((msg->sw_ack_vector << 8) | - msg->sw_ack_vector); + msg_res = msg->sw_ack_vector; if (mmr & msg_res) { stat->d_rcanceled++; uv_write_local_mmr( UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE_ALIAS, - msg_res); + (msg_res << UV_SW_ACK_NPENDING) | + msg_res); } } } -- cgit v1.1 From a8328ee58c15c9d763a67607a35bb987b38950fa Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU discovery of hubs and sockets Correct the initialization-time assumption of contigous blade numbers and of sockets numbered from zero. There may be hubs present with no cpu's enabled. There may be disabled sockets such that the active socket is not number zero. And assign a 'socket master' by assuming that a socket is a node. (it is not safe to extract socket number from an apicid) Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 49 ++++++++++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 18 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 295a411..ab929e9 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1547,11 +1547,13 @@ calculate_destination_timeout(void) */ static void uv_init_per_cpu(int nuvhubs) { - int i, j, k; + int i; int cpu; int pnode; int uvhub; short socket = 0; + unsigned short socket_mask; + unsigned int uvhub_mask; struct bau_control *bcp; struct uvhub_desc *bdp; struct socket_desc *sdp; @@ -1562,7 +1564,7 @@ static void uv_init_per_cpu(int nuvhubs) short cpu_number[16]; }; struct uvhub_desc { - short num_sockets; + unsigned short socket_mask; short num_cpus; short uvhub; short pnode; @@ -1581,43 +1583,54 @@ static void uv_init_per_cpu(int nuvhubs) spin_lock_init(&bcp->masks_lock); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; + uvhub_mask |= (1 << uvhub); bdp = &uvhub_descs[uvhub]; bdp->num_cpus++; bdp->uvhub = uvhub; bdp->pnode = pnode; - /* kludge: assume uv_hub.h is constant */ - socket = (cpu_physical_id(cpu)>>5)&1; - if (socket >= bdp->num_sockets) - bdp->num_sockets = socket+1; + /* kludge: 'assuming' one node per socket, and assuming that + disabling a socket just leaves a gap in node numbers */ + socket = (cpu_to_node(cpu) & 1);; + bdp->socket_mask |= (1 << socket); sdp = &bdp->socket[socket]; sdp->cpu_number[sdp->num_cpus] = cpu; sdp->num_cpus++; } - socket = 0; - for_each_possible_blade(uvhub) { + uvhub = 0; + while (uvhub_mask) { + if (!(uvhub_mask & 1)) + goto nexthub; bdp = &uvhub_descs[uvhub]; - for (i = 0; i < bdp->num_sockets; i++) { - sdp = &bdp->socket[i]; - for (j = 0; j < sdp->num_cpus; j++) { - cpu = sdp->cpu_number[j]; + socket_mask = bdp->socket_mask; + socket = 0; + while (socket_mask) { + if (!(socket_mask & 1)) + goto nextsocket; + sdp = &bdp->socket[socket]; + for (i = 0; i < sdp->num_cpus; i++) { + cpu = sdp->cpu_number[i]; bcp = &per_cpu(bau_control, cpu); bcp->cpu = cpu; - if (j == 0) { + if (i == 0) { smaster = bcp; - if (i == 0) + if (socket == 0) hmaster = bcp; } bcp->cpus_in_uvhub = bdp->num_cpus; bcp->cpus_in_socket = sdp->num_cpus; bcp->socket_master = smaster; + bcp->uvhub = bdp->uvhub; bcp->uvhub_master = hmaster; - for (k = 0; k < DEST_Q_SIZE; k++) - bcp->socket_acknowledge_count[k] = 0; - bcp->uvhub_cpu = - uv_cpu_hub_info(cpu)->blade_processor_id; + bcp->uvhub_cpu = uv_cpu_hub_info(cpu)-> + blade_processor_id; } +nextsocket: socket++; + socket_mask = (socket_mask >> 1); } +nexthub: + uvhub++; + uvhub_mask = (uvhub_mask >> 1); } kfree(uvhub_descs); for_each_present_cpu(cpu) { -- cgit v1.1 From 90cc7d944981a6d06b49bb26fde1b490e28c90e5 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Remove BAU check for stay-busy Remove a faulty assumption that a long running BAU request has encountered a hardware problem and will never finish. Numalink congestion can make a request appear to have encountered such a problem, but it is not safe to cancel the request. If such a cancel is done but a reply is later received we can miss a TLB shootdown. We depend upon the max_bau_concurrent 'throttle' to prevent the stay-busy case from happening. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 23 ----------------------- 1 file changed, 23 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index ab929e9..dc962b5 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -405,12 +405,10 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr; unsigned long mask; cycles_t ttime; - cycles_t timeout_time; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; hmaster = bcp->uvhub_master; - timeout_time = get_cycles() + bcp->timeout_interval; /* spin on the status MMR, waiting for it to go idle */ while ((descriptor_status = (((unsigned long) @@ -450,26 +448,6 @@ static int uv_wait_completion(struct bau_desc *bau_desc, * descriptor_status is still BUSY */ cpu_relax(); - relaxes++; - if (relaxes >= 10000) { - relaxes = 0; - if (get_cycles() > timeout_time) { - quiesce_local_uvhub(hmaster); - - /* single-thread the register change */ - spin_lock(&hmaster->masks_lock); - mmr = uv_read_local_mmr(mmr_offset); - mask = 0UL; - mask |= (3UL < right_shift); - mask = ~mask; - mmr &= mask; - uv_write_local_mmr(mmr_offset, mmr); - spin_unlock(&hmaster->masks_lock); - end_uvhub_quiesce(hmaster); - stat->s_busy++; - return FLUSH_GIVEUP; - } - } } } bcp->conseccompletes++; @@ -1580,7 +1558,6 @@ static void uv_init_per_cpu(int nuvhubs) for_each_present_cpu(cpu) { bcp = &per_cpu(bau_control, cpu); memset(bcp, 0, sizeof(struct bau_control)); - spin_lock_init(&bcp->masks_lock); pnode = uv_cpu_hub_info(cpu)->pnode; uvhub = uv_cpu_hub_info(cpu)->numa_blade_id; uvhub_mask |= (1 << uvhub); -- cgit v1.1 From 7fba1bcd4844a4a8619a03bf51cabc92aea365a8 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Correct BAU regular message type The Broadcast Assist Unit messages have a regular or retry message type. The regular type was not being set, but needs to be, because the lack of a message type is sometimes used to identify an unused entry in the message queue. Also removing some excess comments. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index dc962b5..4cb14db 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -580,23 +580,10 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, } time1 = get_cycles(); do { - /* - * Every message from any given cpu gets a unique message - * sequence number. But retries use that same number. - * Our message may have timed out at the destination because - * all sw-ack resources are in use and there is a timeout - * pending there. In that case, our last send never got - * placed into the queue and we need to persist until it - * does. - * - * Make any retry a type MSG_RETRY so that the destination will - * free any resource held by a previous message from this cpu. - */ if (try == 0) { - /* use message type set by the caller the first time */ + bau_desc->header.msg_type = MSG_REGULAR; seq_number = bcp->message_number++; } else { - /* use RETRY type on all the rest; same sequence */ bau_desc->header.msg_type = MSG_RETRY; stat->s_retry_messages++; } -- cgit v1.1 From 450a007eebaf430426ea8f89bbc3f287949905b2 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: BAU broadcast to the local hub Make the Broadcast Assist Unit driver use the BAU for TLB shootdowns of cpu's on the local uvhub. It was previously thought that IPI might be faster to the cpu's on the local hub. But the IPI operation would have to follow the completion of the BAU broadcast anyway. So we broadcast to the local uvhub in all cases except when the current cpu was the only local cpu in the mask. This simplifies uv_flush_send_and_wait() in that it returns either all shootdowns complete, or none. Adjust the statistics to account for shootdowns on the local uvhub. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 138 ++++++++++++++++++----------------------------- 1 file changed, 53 insertions(+), 85 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index 4cb14db..a161505 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -400,10 +400,7 @@ static int uv_wait_completion(struct bau_desc *bau_desc, unsigned long mmr_offset, int right_shift, int this_cpu, struct bau_control *bcp, struct bau_control *smaster, long try) { - int relaxes = 0; unsigned long descriptor_status; - unsigned long mmr; - unsigned long mask; cycles_t ttime; struct ptc_stats *stat = bcp->statp; struct bau_control *hmaster; @@ -524,25 +521,19 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * The flush_mask contains the cpus the broadcast is to be sent to, plus * cpus that are on the local uvhub. * - * Returns NULL if all flushing represented in the mask was done. The mask - * is zeroed. - * Returns @flush_mask if some remote flushing remains to be done. The - * mask will have some bits still set, representing any cpus on the local - * uvhub (not current cpu) and any on remote uvhubs if the broadcast failed. + * Returns 0 if all flushing represented in the mask was done. + * Returns 1 if it gives up entirely and the original cpu mask is to be + * returned to the kernel. */ -const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, - struct cpumask *flush_mask, - struct bau_control *bcp) +int uv_flush_send_and_wait(struct bau_desc *bau_desc, + struct cpumask *flush_mask, struct bau_control *bcp) { int right_shift; - int uvhub; - int bit; int completion_status = 0; int seq_number = 0; long try = 0; int cpu = bcp->uvhub_cpu; int this_cpu = bcp->cpu; - int this_uvhub = bcp->uvhub; unsigned long mmr_offset; unsigned long index; cycles_t time1; @@ -552,10 +543,6 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, struct bau_control *smaster = bcp->socket_master; struct bau_control *hmaster = bcp->uvhub_master; - /* - * Spin here while there are hmaster->max_bau_concurrent or more active - * descriptors. This is the per-uvhub 'throttle'. - */ if (!atomic_inc_unless_ge(&hmaster->uvhub_lock, &hmaster->active_descriptor_count, hmaster->max_bau_concurrent)) { @@ -591,9 +578,7 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, index = (1UL << UVH_LB_BAU_SB_ACTIVATION_CONTROL_PUSH_SHFT) | bcp->uvhub_cpu; bcp->send_message = get_cycles(); - uv_write_local_mmr(UVH_LB_BAU_SB_ACTIVATION_CONTROL, index); - try++; completion_status = uv_wait_completion(bau_desc, mmr_offset, right_shift, this_cpu, bcp, smaster, try); @@ -652,16 +637,9 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, (hmaster->max_bau_concurrent < hmaster->max_bau_concurrent_constant)) hmaster->max_bau_concurrent++; - - /* - * hold any cpu not timing out here; no other cpu currently held by - * the 'throttle' should enter the activation code - */ while (hmaster->uvhub_quiesce) cpu_relax(); atomic_dec(&hmaster->active_descriptor_count); - - /* guard against cycles wrap */ if (time2 > time1) { elapsed = time2 - time1; stat->s_time += elapsed; @@ -674,32 +652,14 @@ const struct cpumask *uv_flush_send_and_wait(struct bau_desc *bau_desc, } } } else - stat->s_requestor--; /* don't count this one */ + stat->s_requestor--; if (completion_status == FLUSH_COMPLETE && try > 1) stat->s_retriesok++; else if (completion_status == FLUSH_GIVEUP) { - /* - * Cause the caller to do an IPI-style TLB shootdown on - * the target cpu's, all of which are still in the mask. - */ stat->s_giveup++; - return flush_mask; + return 1; } - - /* - * Success, so clear the remote cpu's from the mask so we don't - * use the IPI method of shootdown on them. - */ - for_each_cpu(bit, flush_mask) { - uvhub = uv_cpu_to_blade_id(bit); - if (uvhub == this_uvhub) - continue; - cpumask_clear_cpu(bit, flush_mask); - } - if (!cpumask_empty(flush_mask)) - return flush_mask; - - return NULL; + return 0; } /** @@ -731,10 +691,11 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, struct mm_struct *mm, unsigned long va, unsigned int cpu) { - int remotes; int tcpu; int uvhub; int locals = 0; + int remotes = 0; + int hubs = 0; struct bau_desc *bau_desc; struct cpumask *flush_mask; struct ptc_stats *stat; @@ -768,54 +729,52 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, /* * Each sending cpu has a per-cpu mask which it fills from the caller's - * cpu mask. Only remote cpus are converted to uvhubs and copied. + * cpu mask. All cpus are converted to uvhubs and copied to the + * activation descriptor. */ flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu); - /* - * copy cpumask to flush_mask, removing current cpu - * (current cpu should already have been flushed by the caller and - * should never be returned if we return flush_mask) - */ + /* don't actually do a shootdown of the local cpu */ cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu)); if (cpu_isset(cpu, *cpumask)) - locals++; /* current cpu was targeted */ + stat->s_ntargself++; bau_desc = bcp->descriptor_base; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); - remotes = 0; + + /* cpu statistics */ for_each_cpu(tcpu, flush_mask) { uvhub = uv_cpu_to_blade_id(tcpu); - if (uvhub == bcp->uvhub) { - locals++; - continue; - } bau_uvhub_set(uvhub, &bau_desc->distribution); - remotes++; - } - if (remotes == 0) { - /* - * No off_hub flushing; return status for local hub. - * Return the caller's mask if all were local (the current - * cpu may be in that mask). - */ - if (locals) - return cpumask; + if (uvhub == bcp->uvhub) + locals++; else - return NULL; + remotes++; } + if ((locals + remotes) == 0) + return NULL; stat->s_requestor++; - stat->s_ntargcpu += remotes; + stat->s_ntargcpu += remotes + locals; + stat->s_ntargremotes += remotes; + stat->s_ntarglocals += locals; remotes = bau_uvhub_weight(&bau_desc->distribution); - stat->s_ntarguvhub += remotes; - if (remotes >= 16) + + /* uvhub statistics */ + hubs = bau_uvhub_weight(&bau_desc->distribution); + if (locals) { + stat->s_ntarglocaluvhub++; + stat->s_ntargremoteuvhub += (hubs - 1); + } else + stat->s_ntargremoteuvhub += hubs; + stat->s_ntarguvhub += hubs; + if (hubs >= 16) stat->s_ntarguvhub16++; - else if (remotes >= 8) + else if (hubs >= 8) stat->s_ntarguvhub8++; - else if (remotes >= 4) + else if (hubs >= 4) stat->s_ntarguvhub4++; - else if (remotes >= 2) + else if (hubs >= 2) stat->s_ntarguvhub2++; else stat->s_ntarguvhub1++; @@ -824,10 +783,13 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc->payload.sending_cpu = cpu; /* - * uv_flush_send_and_wait returns null if all cpu's were messaged, or - * the adjusted flush_mask if any cpu's were not messaged. + * uv_flush_send_and_wait returns 0 if all cpu's were messaged, + * or 1 if it gave up and the original cpumask should be returned. */ - return uv_flush_send_and_wait(bau_desc, flush_mask, bcp); + if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp)) + return NULL; + else + return cpumask; } /* @@ -976,9 +938,11 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) if (!cpu) { seq_printf(file, - "# cpu sent stime numuvhubs numuvhubs16 numuvhubs8 "); + "# cpu sent stime self locals remotes ncpus localhub "); + seq_printf(file, + "remotehub numuvhubs numuvhubs16 numuvhubs8 "); seq_printf(file, - "numuvhubs4 numuvhubs2 numuvhubs1 numcpus dto "); + "numuvhubs4 numuvhubs2 numuvhubs1 dto "); seq_printf(file, "retries rok resetp resett giveup sto bz throt "); seq_printf(file, @@ -994,10 +958,14 @@ static int uv_ptc_seq_show(struct seq_file *file, void *data) seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ", cpu, stat->s_requestor, cycles_2_us(stat->s_time), - stat->s_ntarguvhub, stat->s_ntarguvhub16, + stat->s_ntargself, stat->s_ntarglocals, + stat->s_ntargremotes, stat->s_ntargcpu, + stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub, + stat->s_ntarguvhub, stat->s_ntarguvhub16); + seq_printf(file, "%ld %ld %ld %ld %ld ", stat->s_ntarguvhub8, stat->s_ntarguvhub4, stat->s_ntarguvhub2, stat->s_ntarguvhub1, - stat->s_ntargcpu, stat->s_dtimeout); + stat->s_dtimeout); seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ", stat->s_retry_messages, stat->s_retriesok, stat->s_resets_plug, stat->s_resets_timeout, -- cgit v1.1 From f6d8a56693426b1f29ff5cafda8be0d65e4e1870 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Wed, 2 Jun 2010 16:22:02 -0500 Subject: x86, UV: Modularize BAU send and wait Streamline the large uv_flush_send_and_wait() function by use of a couple of helper functions. And remove some excess comments. Signed-off-by: Cliff Wickman Cc: gregkh@suse.de LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 82 ++++++++++++++++++++++++++---------------------- 1 file changed, 44 insertions(+), 38 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index a161505..abf3c31 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -485,6 +485,47 @@ static inline int atomic_inc_unless_ge(spinlock_t *lock, atomic_t *v, int u) } /* + * Our retries are blocked by all destination swack resources being + * in use, and a timeout is pending. In that case hardware immediately + * returns the ERROR that looks like a destination timeout. + */ +static void +destination_plugged(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + udelay(bcp->plugged_delay); + bcp->plugged_tries++; + if (bcp->plugged_tries >= bcp->plugsb4reset) { + bcp->plugged_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_plug++; + } +} + +static void +destination_timeout(struct bau_desc *bau_desc, struct bau_control *bcp, + struct bau_control *hmaster, struct ptc_stats *stat) +{ + hmaster->max_bau_concurrent = 1; + bcp->timeout_tries++; + if (bcp->timeout_tries >= bcp->timeoutsb4reset) { + bcp->timeout_tries = 0; + quiesce_local_uvhub(hmaster); + spin_lock(&hmaster->queue_lock); + uv_reset_with_ipi(&bau_desc->distribution, bcp->cpu); + spin_unlock(&hmaster->queue_lock); + end_uvhub_quiesce(hmaster); + bcp->ipi_attempts++; + stat->s_resets_timeout++; + } +} + +/* * Completions are taking a very long time due to a congested numalink * network. */ @@ -518,7 +559,7 @@ disable_for_congestion(struct bau_control *bcp, struct ptc_stats *stat) * * Send a broadcast and wait for it to complete. * - * The flush_mask contains the cpus the broadcast is to be sent to, plus + * The flush_mask contains the cpus the broadcast is to be sent to including * cpus that are on the local uvhub. * * Returns 0 if all flushing represented in the mask was done. @@ -553,7 +594,6 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, &hmaster->active_descriptor_count, hmaster->max_bau_concurrent)); } - while (hmaster->uvhub_quiesce) cpu_relax(); @@ -584,40 +624,9 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, right_shift, this_cpu, bcp, smaster, try); if (completion_status == FLUSH_RETRY_PLUGGED) { - /* - * Our retries may be blocked by all destination swack - * resources being consumed, and a timeout pending. In - * that case hardware immediately returns the ERROR - * that looks like a destination timeout. - */ - udelay(bcp->plugged_delay); - bcp->plugged_tries++; - if (bcp->plugged_tries >= bcp->plugsb4reset) { - bcp->plugged_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, - this_cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_plug++; - } + destination_plugged(bau_desc, bcp, hmaster, stat); } else if (completion_status == FLUSH_RETRY_TIMEOUT) { - hmaster->max_bau_concurrent = 1; - bcp->timeout_tries++; - udelay(TIMEOUT_DELAY); - if (bcp->timeout_tries >= bcp->timeoutsb4reset) { - bcp->timeout_tries = 0; - quiesce_local_uvhub(hmaster); - spin_lock(&hmaster->queue_lock); - uv_reset_with_ipi(&bau_desc->distribution, - this_cpu); - spin_unlock(&hmaster->queue_lock); - end_uvhub_quiesce(hmaster); - bcp->ipi_attempts++; - stat->s_resets_timeout++; - } + destination_timeout(bau_desc, bcp, hmaster, stat); } if (bcp->ipi_attempts >= bcp->ipi_reset_limit) { bcp->ipi_attempts = 0; @@ -628,10 +637,8 @@ int uv_flush_send_and_wait(struct bau_desc *bau_desc, } while ((completion_status == FLUSH_RETRY_PLUGGED) || (completion_status == FLUSH_RETRY_TIMEOUT)); time2 = get_cycles(); - bcp->plugged_tries = 0; bcp->timeout_tries = 0; - if ((completion_status == FLUSH_COMPLETE) && (bcp->conseccompletes > bcp->complete_threshold) && (hmaster->max_bau_concurrent < @@ -740,7 +747,6 @@ const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask, bau_desc = bcp->descriptor_base; bau_desc += UV_ITEMS_PER_DESCRIPTOR * bcp->uvhub_cpu; - bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE); /* cpu statistics */ -- cgit v1.1 From 93a7ca0c3ebe5d931126f1fb732cb9c4518383d4 Mon Sep 17 00:00:00 2001 From: Cliff Wickman Date: Fri, 16 Jul 2010 10:11:21 -0500 Subject: x86, UV: Initialize BAU MMRs only on hubs with cpus Remove the initialization of MMRs UVH_LB_BAU_SB_ACTIVATION_CONTROL and UVH_BAU_DATA_BROADCAST on UV hubs that have no active cpus. Such initialization on hubs with no active cpus would result in a kernel page fault. This is not of real high priority, because we don't have any such systems (with UV hubs that have no active cpus). But they will be coming. Signed-off-by: Cliff Wickman LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tlb_uv.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) (limited to 'arch/x86/kernel') diff --git a/arch/x86/kernel/tlb_uv.c b/arch/x86/kernel/tlb_uv.c index abf3c31..59efb53 100644 --- a/arch/x86/kernel/tlb_uv.c +++ b/arch/x86/kernel/tlb_uv.c @@ -1635,12 +1635,16 @@ static int __init uv_bau_init(void) alloc_intr_gate(vector, uv_bau_message_intr1); for_each_possible_blade(uvhub) { - pnode = uv_blade_to_pnode(uvhub); - /* INIT the bau */ - uv_write_global_mmr64(pnode, UVH_LB_BAU_SB_ACTIVATION_CONTROL, - ((unsigned long)1 << 63)); - mmr = 1; /* should be 1 to broadcast to both sockets */ - uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, mmr); + if (uv_blade_nr_possible_cpus(uvhub)) { + pnode = uv_blade_to_pnode(uvhub); + /* INIT the bau */ + uv_write_global_mmr64(pnode, + UVH_LB_BAU_SB_ACTIVATION_CONTROL, + ((unsigned long)1 << 63)); + mmr = 1; /* should be 1 to broadcast to both sockets */ + uv_write_global_mmr64(pnode, UVH_BAU_DATA_BROADCAST, + mmr); + } } return 0; -- cgit v1.1