From 19c635ab24a1e94a759e82bfb34554a6a0db215e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:17 -0700 Subject: x86/intel_rdt/mba_sc: Enable/disable MBA software controller Currently user does memory bandwidth allocation(MBA) by specifying the bandwidth in percentage via the resctrl schemata file: "/sys/fs/resctrl/schemata" Add a new mount option "mba_MBps" to enable the user to specify MBA in MBps: $mount -t resctrl resctrl [-o cdp[,cdpl2][mba_MBps]] /sys/fs/resctrl Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-3-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/x86/kernel/cpu/intel_rdt.c') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 589b948..53ee683 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -230,6 +230,14 @@ static inline void cache_alloc_hsw_probe(void) rdt_alloc_capable = true; } +bool is_mba_sc(struct rdt_resource *r) +{ + if (!r) + return rdt_resources_all[RDT_RESOURCE_MBA].membw.mba_sc; + + return r->membw.mba_sc; +} + /* * rdt_get_mb_table() - get a mapping of bandwidth(b/w) percentage values * exposed to user interface and the h/w understandable delay values. -- cgit v1.1 From 1bd2a63b4f0deefe745aa0fd969c07b2eb9ee99e Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:18 -0700 Subject: x86/intel_rdt/mba_sc: Add initialization support When MBA software controller is enabled, a per domain storage is required for user specified bandwidth in "MBps" and the "percentage" values which are programmed into the IA32_MBA_THRTL_MSR. Add support for these data structures and initialization. The MBA percentage values have a default max value of 100 but however the max value in MBps is not available from the hardware so it's set to U32_MAX. This simply says that the control group can use all bandwidth by default but does not say what is the actual max bandwidth available. The actual bandwidth that is available may depend on lot of factors like QPI link, number of memory channels, memory channel frequency, its width and memory speed, how many channels are configured and also if memory interleaving is enabled. So there is no way to determine the maximum at runtime reliably. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-4-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'arch/x86/kernel/cpu/intel_rdt.c') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 53ee683..8c09e9d 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -35,6 +35,7 @@ #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 +#define MBA_MAX_MBPS U32_MAX /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); @@ -439,25 +440,40 @@ struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, return NULL; } +void setup_default_ctrlval(struct rdt_resource *r, u32 *dc, u32 *dm) +{ + int i; + + /* + * Initialize the Control MSRs to having no control. + * For Cache Allocation: Set all bits in cbm + * For Memory Allocation: Set b/w requested to 100% + * and the bandwidth in MBps to U32_MAX + */ + for (i = 0; i < r->num_closid; i++, dc++, dm++) { + *dc = r->default_ctrl; + *dm = MBA_MAX_MBPS; + } +} + static int domain_setup_ctrlval(struct rdt_resource *r, struct rdt_domain *d) { struct msr_param m; - u32 *dc; - int i; + u32 *dc, *dm; dc = kmalloc_array(r->num_closid, sizeof(*d->ctrl_val), GFP_KERNEL); if (!dc) return -ENOMEM; - d->ctrl_val = dc; + dm = kmalloc_array(r->num_closid, sizeof(*d->mbps_val), GFP_KERNEL); + if (!dm) { + kfree(dc); + return -ENOMEM; + } - /* - * Initialize the Control MSRs to having no control. - * For Cache Allocation: Set all bits in cbm - * For Memory Allocation: Set b/w requested to 100 - */ - for (i = 0; i < r->num_closid; i++, dc++) - *dc = r->default_ctrl; + d->ctrl_val = dc; + d->mbps_val = dm; + setup_default_ctrlval(r, dc, dm); m.low = 0; m.high = r->num_closid; @@ -596,6 +612,7 @@ static void domain_remove_cpu(int cpu, struct rdt_resource *r) } kfree(d->ctrl_val); + kfree(d->mbps_val); kfree(d->rmid_busy_llc); kfree(d->mbm_total); kfree(d->mbm_local); -- cgit v1.1 From 8205a078ba7819c23558e31af4b3bda04d9b3bae Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:19 -0700 Subject: x86/intel_rdt/mba_sc: Add schemata support Currently when user updates the "schemata" with new MBA percentage values, kernel writes the corresponding bandwidth percentage values to the IA32_MBA_THRTL_MSR. When MBA is expressed in MBps, the schemata format is changed to have the per package memory bandwidth in MBps instead of being specified in percentage. Do not write the IA32_MBA_THRTL_MSRs when the schemata is updated as that is handled separately. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-5-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'arch/x86/kernel/cpu/intel_rdt.c') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index 8c09e9d..ad03d97 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -179,7 +179,7 @@ struct rdt_resource rdt_resources_all[] = { .msr_update = mba_wrmsr, .cache_level = 3, .parse_ctrlval = parse_bw, - .format_str = "%d=%*d", + .format_str = "%d=%*u", .fflags = RFTYPE_RES_MB, }, }; -- cgit v1.1 From de73f38f768021610bd305cf74ef3702fcf6a1eb Mon Sep 17 00:00:00 2001 From: Vikas Shivappa Date: Fri, 20 Apr 2018 15:36:21 -0700 Subject: x86/intel_rdt/mba_sc: Feedback loop to dynamically update mem bandwidth mba_sc is a feedback loop where we periodically read MBM counters and try to restrict the bandwidth below a max value so the below is always true: "current bandwidth(cur_bw) < user specified bandwidth(user_bw)" The frequency of these checks is currently 1s and we just tag along the MBM overflow timer to do the updates. Doing it once in a second also makes the calculation of bandwidth easy. The steps of increase or decrease of bandwidth is the minimum granularity specified by the hardware. Although the MBA's goal is to restrict the bandwidth below a maximum, there may be a need to even increase the bandwidth. Since MBA controls the L2 external bandwidth where as MBM measures the L3 external bandwidth, we may end up restricting some rdtgroups unnecessarily. This may happen in the sequence where rdtgroup (set of jobs) had high "L3 <-> memory traffic" in initial phases -> mba_sc kicks in and reduced bandwidth percentage values -> but after some it has mostly "L2 <-> L3" traffic. In this scenario mba_sc increases the bandwidth percentage when there is lesser memory traffic. Signed-off-by: Vikas Shivappa Signed-off-by: Thomas Gleixner Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: hpa@zytor.com Link: https://lkml.kernel.org/r/1524263781-14267-7-git-send-email-vikas.shivappa@linux.intel.com --- arch/x86/kernel/cpu/intel_rdt.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'arch/x86/kernel/cpu/intel_rdt.c') diff --git a/arch/x86/kernel/cpu/intel_rdt.c b/arch/x86/kernel/cpu/intel_rdt.c index ad03d97..24bfa63 100644 --- a/arch/x86/kernel/cpu/intel_rdt.c +++ b/arch/x86/kernel/cpu/intel_rdt.c @@ -33,7 +33,6 @@ #include #include "intel_rdt.h" -#define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 #define MBA_MAX_MBPS U32_MAX @@ -350,7 +349,7 @@ static int get_cache_id(int cpu, int level) * that can be written to QOS_MSRs. * There are currently no SKUs which support non linear delay values. */ -static u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) +u32 delay_bw_map(unsigned long bw, struct rdt_resource *r) { if (r->membw.delay_linear) return MAX_MBA_BW - bw; -- cgit v1.1