summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--arch/i386/Kconfig9
-rw-r--r--arch/i386/kernel/cpu/common.c10
-rw-r--r--arch/i386/kernel/cpu/intel_cacheinfo.c22
-rw-r--r--arch/i386/kernel/smpboot.c24
-rw-r--r--arch/x86_64/Kconfig9
-rw-r--r--arch/x86_64/kernel/setup.c3
-rw-r--r--arch/x86_64/kernel/smpboot.c24
-rw-r--r--include/asm-i386/processor.h5
-rw-r--r--include/asm-i386/topology.h2
-rw-r--r--include/asm-x86_64/processor.h4
-rw-r--r--include/asm-x86_64/smp.h1
-rw-r--r--include/asm-x86_64/topology.h2
-rw-r--r--include/linux/topology.h9
-rw-r--r--kernel/sched.c73
14 files changed, 186 insertions, 11 deletions
diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig
index f7db71d..f17bd1d 100644
--- a/arch/i386/Kconfig
+++ b/arch/i386/Kconfig
@@ -231,6 +231,15 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.
+config SCHED_MC
+ bool "Multi-core scheduler support"
+ depends on SMP
+ default y
+ help
+ Multi-core scheduler support improves the CPU scheduler's decision
+ making when dealing with multi-core CPU chips at a cost of slightly
+ increased overhead in some places. If unsure say N here.
+
source "kernel/Kconfig.preempt"
config X86_UP_APIC
diff --git a/arch/i386/kernel/cpu/common.c b/arch/i386/kernel/cpu/common.c
index 7e3d6b6..a06a490 100644
--- a/arch/i386/kernel/cpu/common.c
+++ b/arch/i386/kernel/cpu/common.c
@@ -266,7 +266,7 @@ static void __init early_cpu_detect(void)
void __cpuinit generic_identify(struct cpuinfo_x86 * c)
{
u32 tfms, xlvl;
- int junk;
+ int ebx;
if (have_cpuid_p()) {
/* Get vendor name */
@@ -282,7 +282,7 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
/* Intel-defined flags: level 0x00000001 */
if ( c->cpuid_level >= 0x00000001 ) {
u32 capability, excap;
- cpuid(0x00000001, &tfms, &junk, &excap, &capability);
+ cpuid(0x00000001, &tfms, &ebx, &excap, &capability);
c->x86_capability[0] = capability;
c->x86_capability[4] = excap;
c->x86 = (tfms >> 8) & 15;
@@ -292,6 +292,11 @@ void __cpuinit generic_identify(struct cpuinfo_x86 * c)
if (c->x86 >= 0x6)
c->x86_model += ((tfms >> 16) & 0xF) << 4;
c->x86_mask = tfms & 15;
+#ifdef CONFIG_SMP
+ c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
+#else
+ c->apicid = (ebx >> 24) & 0xFF;
+#endif
} else {
/* Have CPUID level 0 only - unheard of */
c->x86 = 4;
@@ -474,7 +479,6 @@ void __cpuinit detect_ht(struct cpuinfo_x86 *c)
cpuid(1, &eax, &ebx, &ecx, &edx);
- c->apicid = phys_pkg_id((ebx >> 24) & 0xFF, 0);
if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
diff --git a/arch/i386/kernel/cpu/intel_cacheinfo.c b/arch/i386/kernel/cpu/intel_cacheinfo.c
index ce61921..7e7fd4e 100644
--- a/arch/i386/kernel/cpu/intel_cacheinfo.c
+++ b/arch/i386/kernel/cpu/intel_cacheinfo.c
@@ -173,6 +173,10 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
unsigned int trace = 0, l1i = 0, l1d = 0, l2 = 0, l3 = 0; /* Cache sizes */
unsigned int new_l1d = 0, new_l1i = 0; /* Cache sizes from cpuid(4) */
unsigned int new_l2 = 0, new_l3 = 0, i; /* Cache sizes from cpuid(4) */
+ unsigned int l2_id = 0, l3_id = 0, num_threads_sharing, index_msb;
+#ifdef CONFIG_SMP
+ unsigned int cpu = (c == &boot_cpu_data) ? 0 : (c - cpu_data);
+#endif
if (c->cpuid_level > 3) {
static int is_initialized;
@@ -205,9 +209,15 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
break;
case 2:
new_l2 = this_leaf.size/1024;
+ num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ l2_id = c->apicid >> index_msb;
break;
case 3:
new_l3 = this_leaf.size/1024;
+ num_threads_sharing = 1 + this_leaf.eax.split.num_threads_sharing;
+ index_msb = get_count_order(num_threads_sharing);
+ l3_id = c->apicid >> index_msb;
break;
default:
break;
@@ -273,11 +283,19 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
if (new_l1i)
l1i = new_l1i;
- if (new_l2)
+ if (new_l2) {
l2 = new_l2;
+#ifdef CONFIG_SMP
+ cpu_llc_id[cpu] = l2_id;
+#endif
+ }
- if (new_l3)
+ if (new_l3) {
l3 = new_l3;
+#ifdef CONFIG_SMP
+ cpu_llc_id[cpu] = l3_id;
+#endif
+ }
if ( trace )
printk (KERN_INFO "CPU: Trace cache: %dK uops", trace);
diff --git a/arch/i386/kernel/smpboot.c b/arch/i386/kernel/smpboot.c
index 82371d8..a696990 100644
--- a/arch/i386/kernel/smpboot.c
+++ b/arch/i386/kernel/smpboot.c
@@ -72,6 +72,9 @@ int phys_proc_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
/* Core ID of each logical CPU */
int cpu_core_id[NR_CPUS] __read_mostly = {[0 ... NR_CPUS-1] = BAD_APICID};
+/* Last level cache ID of each logical CPU */
+int cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
/* representing HT siblings of each logical CPU */
cpumask_t cpu_sibling_map[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(cpu_sibling_map);
@@ -440,6 +443,18 @@ static void __devinit smp_callin(void)
static int cpucount;
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ /*
+ * For perf, we return last level cache shared map.
+ * TBD: when power saving sched policy is added, we will return
+ * cpu_core_map when power saving policy is enabled
+ */
+ return c->llc_shared_map;
+}
+
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
@@ -459,12 +474,16 @@ set_cpu_sibling_map(int cpu)
cpu_set(cpu, cpu_sibling_map[i]);
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
+ cpu_set(cpu, c[cpu].llc_shared_map);
+
if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
c[cpu].booted_cores = 1;
@@ -472,6 +491,11 @@ set_cpu_sibling_map(int cpu)
}
for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (cpu_llc_id[cpu] != BAD_APICID &&
+ cpu_llc_id[cpu] == cpu_llc_id[i]) {
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
+ }
if (phys_proc_id[cpu] == phys_proc_id[i]) {
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
diff --git a/arch/x86_64/Kconfig b/arch/x86_64/Kconfig
index 45efe0c..1cb4aa2 100644
--- a/arch/x86_64/Kconfig
+++ b/arch/x86_64/Kconfig
@@ -250,6 +250,15 @@ config SCHED_SMT
cost of slightly increased overhead in some places. If unsure say
N here.
+config SCHED_MC
+ bool "Multi-core scheduler support"
+ depends on SMP
+ default y
+ help
+ Multi-core scheduler support improves the CPU scheduler's decision
+ making when dealing with multi-core CPU chips at a cost of slightly
+ increased overhead in some places. If unsure say N here.
+
source "kernel/Kconfig.preempt"
config NUMA
diff --git a/arch/x86_64/kernel/setup.c b/arch/x86_64/kernel/setup.c
index a57eec8..d1f3e92 100644
--- a/arch/x86_64/kernel/setup.c
+++ b/arch/x86_64/kernel/setup.c
@@ -962,7 +962,6 @@ static void __cpuinit detect_ht(struct cpuinfo_x86 *c)
cpuid(1, &eax, &ebx, &ecx, &edx);
- c->apicid = phys_pkg_id(0);
if (!cpu_has(c, X86_FEATURE_HT) || cpu_has(c, X86_FEATURE_CMP_LEGACY))
return;
@@ -1171,6 +1170,8 @@ void __cpuinit identify_cpu(struct cpuinfo_x86 *c)
c->x86_capability[2] = cpuid_edx(0x80860001);
}
+ c->apicid = phys_pkg_id(0);
+
/*
* Vendor-specific initialization. In this section we
* canonicalize the feature flags, meaning if there are
diff --git a/arch/x86_64/kernel/smpboot.c b/arch/x86_64/kernel/smpboot.c
index 66e9865..ea48fa6 100644
--- a/arch/x86_64/kernel/smpboot.c
+++ b/arch/x86_64/kernel/smpboot.c
@@ -68,6 +68,9 @@ u8 phys_proc_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
/* core ID of each logical CPU */
u8 cpu_core_id[NR_CPUS] __read_mostly = { [0 ... NR_CPUS-1] = BAD_APICID };
+/* Last level cache ID of each logical CPU */
+u8 cpu_llc_id[NR_CPUS] __cpuinitdata = {[0 ... NR_CPUS-1] = BAD_APICID};
+
/* Bitmask of currently online CPUs */
cpumask_t cpu_online_map __read_mostly;
@@ -445,6 +448,18 @@ void __cpuinit smp_callin(void)
cpu_set(cpuid, cpu_callin_map);
}
+/* maps the cpu to the sched domain representing multi-core */
+cpumask_t cpu_coregroup_map(int cpu)
+{
+ struct cpuinfo_x86 *c = cpu_data + cpu;
+ /*
+ * For perf, we return last level cache shared map.
+ * TBD: when power saving sched policy is added, we will return
+ * cpu_core_map when power saving policy is enabled
+ */
+ return c->llc_shared_map;
+}
+
/* representing cpus for which sibling maps can be computed */
static cpumask_t cpu_sibling_setup_map;
@@ -463,12 +478,16 @@ static inline void set_cpu_sibling_map(int cpu)
cpu_set(cpu, cpu_sibling_map[i]);
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
}
}
} else {
cpu_set(cpu, cpu_sibling_map[cpu]);
}
+ cpu_set(cpu, c[cpu].llc_shared_map);
+
if (current_cpu_data.x86_max_cores == 1) {
cpu_core_map[cpu] = cpu_sibling_map[cpu];
c[cpu].booted_cores = 1;
@@ -476,6 +495,11 @@ static inline void set_cpu_sibling_map(int cpu)
}
for_each_cpu_mask(i, cpu_sibling_setup_map) {
+ if (cpu_llc_id[cpu] != BAD_APICID &&
+ cpu_llc_id[cpu] == cpu_llc_id[i]) {
+ cpu_set(i, c[cpu].llc_shared_map);
+ cpu_set(cpu, c[i].llc_shared_map);
+ }
if (phys_proc_id[cpu] == phys_proc_id[i]) {
cpu_set(i, cpu_core_map[cpu]);
cpu_set(cpu, cpu_core_map[i]);
diff --git a/include/asm-i386/processor.h b/include/asm-i386/processor.h
index feca5d9..af4bfd0 100644
--- a/include/asm-i386/processor.h
+++ b/include/asm-i386/processor.h
@@ -20,6 +20,7 @@
#include <linux/config.h>
#include <linux/threads.h>
#include <asm/percpu.h>
+#include <linux/cpumask.h>
/* flag for disabling the tsc */
extern int tsc_disable;
@@ -67,6 +68,9 @@ struct cpuinfo_x86 {
char pad0;
int x86_power;
unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+ cpumask_t llc_shared_map; /* cpus sharing the last level cache */
+#endif
unsigned char x86_max_cores; /* cpuid returned max cores value */
unsigned char booted_cores; /* number of cores as seen by OS */
unsigned char apicid;
@@ -103,6 +107,7 @@ extern struct cpuinfo_x86 cpu_data[];
extern int phys_proc_id[NR_CPUS];
extern int cpu_core_id[NR_CPUS];
+extern int cpu_llc_id[NR_CPUS];
extern char ignore_fpu_irq;
extern void identify_cpu(struct cpuinfo_x86 *);
diff --git a/include/asm-i386/topology.h b/include/asm-i386/topology.h
index aa958c6..b94e5ee 100644
--- a/include/asm-i386/topology.h
+++ b/include/asm-i386/topology.h
@@ -112,4 +112,6 @@ extern unsigned long node_remap_size[];
#endif /* CONFIG_NUMA */
+extern cpumask_t cpu_coregroup_map(int cpu);
+
#endif /* _ASM_I386_TOPOLOGY_H */
diff --git a/include/asm-x86_64/processor.h b/include/asm-x86_64/processor.h
index 8c8d88c..1aa2cee 100644
--- a/include/asm-x86_64/processor.h
+++ b/include/asm-x86_64/processor.h
@@ -20,6 +20,7 @@
#include <asm/mmsegment.h>
#include <asm/percpu.h>
#include <linux/personality.h>
+#include <linux/cpumask.h>
#define TF_MASK 0x00000100
#define IF_MASK 0x00000200
@@ -65,6 +66,9 @@ struct cpuinfo_x86 {
__u32 x86_power;
__u32 extended_cpuid_level; /* Max extended CPUID function supported */
unsigned long loops_per_jiffy;
+#ifdef CONFIG_SMP
+ cpumask_t llc_shared_map; /* cpus sharing the last level cache */
+#endif
__u8 apicid;
__u8 booted_cores; /* number of cores as seen by OS */
} ____cacheline_aligned;
diff --git a/include/asm-x86_64/smp.h b/include/asm-x86_64/smp.h
index 9ccbb2c..a4fdaeb 100644
--- a/include/asm-x86_64/smp.h
+++ b/include/asm-x86_64/smp.h
@@ -56,6 +56,7 @@ extern cpumask_t cpu_sibling_map[NR_CPUS];
extern cpumask_t cpu_core_map[NR_CPUS];
extern u8 phys_proc_id[NR_CPUS];
extern u8 cpu_core_id[NR_CPUS];
+extern u8 cpu_llc_id[NR_CPUS];
#define SMP_TRAMPOLINE_BASE 0x6000
diff --git a/include/asm-x86_64/topology.h b/include/asm-x86_64/topology.h
index c642f5d..9db54e9 100644
--- a/include/asm-x86_64/topology.h
+++ b/include/asm-x86_64/topology.h
@@ -68,4 +68,6 @@ extern int __node_distance(int, int);
#include <asm-generic/topology.h>
+extern cpumask_t cpu_coregroup_map(int cpu);
+
#endif
diff --git a/include/linux/topology.h b/include/linux/topology.h
index e8eb004..a305ae2 100644
--- a/include/linux/topology.h
+++ b/include/linux/topology.h
@@ -164,6 +164,15 @@
.nr_balance_failed = 0, \
}
+#ifdef CONFIG_SCHED_MC
+#ifndef SD_MC_INIT
+/* for now its same as SD_CPU_INIT.
+ * TBD: Tune Domain parameters!
+ */
+#define SD_MC_INIT SD_CPU_INIT
+#endif
+#endif
+
#ifdef CONFIG_NUMA
#ifndef SD_NODE_INIT
#error Please define an appropriate SD_NODE_INIT in include/asm/topology.h!!!
diff --git a/kernel/sched.c b/kernel/sched.c
index a96a05d..8a8b71b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5574,11 +5574,31 @@ static int cpu_to_cpu_group(int cpu)
}
#endif
+#ifdef CONFIG_SCHED_MC
+static DEFINE_PER_CPU(struct sched_domain, core_domains);
+static struct sched_group sched_group_core[NR_CPUS];
+#endif
+
+#if defined(CONFIG_SCHED_MC) && defined(CONFIG_SCHED_SMT)
+static int cpu_to_core_group(int cpu)
+{
+ return first_cpu(cpu_sibling_map[cpu]);
+}
+#elif defined(CONFIG_SCHED_MC)
+static int cpu_to_core_group(int cpu)
+{
+ return cpu;
+}
+#endif
+
static DEFINE_PER_CPU(struct sched_domain, phys_domains);
static struct sched_group sched_group_phys[NR_CPUS];
static int cpu_to_phys_group(int cpu)
{
-#ifdef CONFIG_SCHED_SMT
+#if defined(CONFIG_SCHED_MC)
+ cpumask_t mask = cpu_coregroup_map(cpu);
+ return first_cpu(mask);
+#elif defined(CONFIG_SCHED_SMT)
return first_cpu(cpu_sibling_map[cpu]);
#else
return cpu;
@@ -5676,6 +5696,17 @@ void build_sched_domains(const cpumask_t *cpu_map)
sd->parent = p;
sd->groups = &sched_group_phys[group];
+#ifdef CONFIG_SCHED_MC
+ p = sd;
+ sd = &per_cpu(core_domains, i);
+ group = cpu_to_core_group(i);
+ *sd = SD_MC_INIT;
+ sd->span = cpu_coregroup_map(i);
+ cpus_and(sd->span, sd->span, *cpu_map);
+ sd->parent = p;
+ sd->groups = &sched_group_core[group];
+#endif
+
#ifdef CONFIG_SCHED_SMT
p = sd;
sd = &per_cpu(cpu_domains, i);
@@ -5701,6 +5732,19 @@ void build_sched_domains(const cpumask_t *cpu_map)
}
#endif
+#ifdef CONFIG_SCHED_MC
+ /* Set up multi-core groups */
+ for_each_cpu_mask(i, *cpu_map) {
+ cpumask_t this_core_map = cpu_coregroup_map(i);
+ cpus_and(this_core_map, this_core_map, *cpu_map);
+ if (i != first_cpu(this_core_map))
+ continue;
+ init_sched_build_groups(sched_group_core, this_core_map,
+ &cpu_to_core_group);
+ }
+#endif
+
+
/* Set up physical groups */
for (i = 0; i < MAX_NUMNODES; i++) {
cpumask_t nodemask = node_to_cpumask(i);
@@ -5797,11 +5841,31 @@ void build_sched_domains(const cpumask_t *cpu_map)
power = SCHED_LOAD_SCALE;
sd->groups->cpu_power = power;
#endif
+#ifdef CONFIG_SCHED_MC
+ sd = &per_cpu(core_domains, i);
+ power = SCHED_LOAD_SCALE + (cpus_weight(sd->groups->cpumask)-1)
+ * SCHED_LOAD_SCALE / 10;
+ sd->groups->cpu_power = power;
+
+ sd = &per_cpu(phys_domains, i);
+ /*
+ * This has to be < 2 * SCHED_LOAD_SCALE
+ * Lets keep it SCHED_LOAD_SCALE, so that
+ * while calculating NUMA group's cpu_power
+ * we can simply do
+ * numa_group->cpu_power += phys_group->cpu_power;
+ *
+ * See "only add power once for each physical pkg"
+ * comment below
+ */
+ sd->groups->cpu_power = SCHED_LOAD_SCALE;
+#else
sd = &per_cpu(phys_domains, i);
power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
(cpus_weight(sd->groups->cpumask)-1) / 10;
sd->groups->cpu_power = power;
+#endif
#ifdef CONFIG_NUMA
sd = &per_cpu(allnodes_domains, i);
@@ -5823,7 +5887,6 @@ void build_sched_domains(const cpumask_t *cpu_map)
next_sg:
for_each_cpu_mask(j, sg->cpumask) {
struct sched_domain *sd;
- int power;
sd = &per_cpu(phys_domains, j);
if (j != first_cpu(sd->groups->cpumask)) {
@@ -5833,10 +5896,8 @@ next_sg:
*/
continue;
}
- power = SCHED_LOAD_SCALE + SCHED_LOAD_SCALE *
- (cpus_weight(sd->groups->cpumask)-1) / 10;
- sg->cpu_power += power;
+ sg->cpu_power += sd->groups->cpu_power;
}
sg = sg->next;
if (sg != sched_group_nodes[i])
@@ -5849,6 +5910,8 @@ next_sg:
struct sched_domain *sd;
#ifdef CONFIG_SCHED_SMT
sd = &per_cpu(cpu_domains, i);
+#elif defined(CONFIG_SCHED_MC)
+ sd = &per_cpu(core_domains, i);
#else
sd = &per_cpu(phys_domains, i);
#endif
OpenPOWER on IntegriCloud