path: root/sys/ofed/drivers/net/mlx4/sys_tune.c
/*
 * Copyright (c) 2010 Mellanox Technologies. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * OpenIB.org BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 *
 */

#include <linux/sched.h>
#include <linux/mutex.h>
#include <asm/atomic.h>

#include "mlx4.h"

#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)



/* Each CPU is put into a group.  In most cases, the group number is
 * equal to the CPU number of one of the CPUs in the group.  The
 * exception is group NR_CPUS which is the default group.  This is
 * protected by sys_tune_startup_mutex. */
DEFINE_PER_CPU(int, idle_cpu_group) = NR_CPUS;

/* For each group, a count of the number of CPUs in the group which
 * are known to be busy.  A busy CPU might be running the busy loop
 * below or general kernel code.  The count is decremented on entry to
 * the old pm_idle handler and incremented on exit.  The aim is to
 * avoid the count going to zero or negative.  This situation can
 * occur temporarily during module unload or CPU hot-plug but
 * normality will be restored when the affected CPUs next exit the
 * idle loop. */
static atomic_t busy_cpu_count[NR_CPUS+1];

/* A workqueue item to be executed to cause the CPU to exit from the
 * idle loop. */
DEFINE_PER_CPU(struct work_struct, sys_tune_cpu_work);

/* Debug hook for recording per-CPU state transitions in the idle handler;
 * a no-op in this build. */
#define sys_tune_set_state(CPU,STATE) \
	do { } while (0)


/* A mutex to protect most of the module data structures. */
static DEFINE_MUTEX(sys_tune_startup_mutex);

/* The old pm_idle handler. */
static void (*old_pm_idle)(void) = NULL;

static void sys_tune_pm_idle(void)
{
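	/*
	 * Replacement pm_idle handler.  Spin with interrupts enabled while
	 * this CPU is the only busy CPU in its group; once another CPU in
	 * the group is busy, give up the busy slot and defer to the original
	 * handler so this CPU can idle normally.
	 */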
	atomic_t *busy_cpus_ptr;
	int busy_cpus;
	int cpu = smp_processor_id();

	busy_cpus_ptr = &(busy_cpu_count[per_cpu(idle_cpu_group, cpu)]);

	sys_tune_set_state(cpu, 2);

	local_irq_enable();
	while (!need_resched()) {
		busy_cpus = atomic_read(busy_cpus_ptr);

		/* If other CPUs in this group are busy then let this
		 * CPU go idle.  We mustn't let the number of busy
		 * CPUs drop below 1. */
		if (busy_cpus > 1 &&
		    old_pm_idle != NULL &&
		    atomic_cmpxchg(busy_cpus_ptr, busy_cpus,
				   busy_cpus - 1) == busy_cpus) {
			local_irq_disable();
			sys_tune_set_state(cpu, 3);
			/* This check might not be necessary, but it
			 * seems safest to include it because there
			 * might be a kernel version which requires
			 * it. */
			if (need_resched())
				local_irq_enable();
			else
				old_pm_idle();
			/* This CPU is busy again. */
			sys_tune_set_state(cpu, 1);
			atomic_add(1, busy_cpus_ptr);
			return;
		}

		cpu_relax();
	}
	sys_tune_set_state(cpu, 0);
}


void sys_tune_work_func(struct work_struct *work)
{
	/* Do nothing.  Since this function is running in process
	 * context, the idle thread isn't running on this CPU. */
}


#ifdef CONFIG_SMP
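/* Runs on every CPU via on_each_cpu().  Queuing the per-CPU work item forces
 * the idle thread on that CPU to be preempted. */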
static void sys_tune_smp_call(void *info)
{
	schedule_work(&get_cpu_var(sys_tune_cpu_work));
	put_cpu_var(sys_tune_cpu_work);
}
#endif


#ifdef CONFIG_SMP
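/* Kick every CPU out of the idle loop so it re-reads its group and the
 * currently installed pm_idle handler. */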
static void sys_tune_refresh(void)
{
#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26)
	on_each_cpu(&sys_tune_smp_call, NULL, 0, 1);
#else
	on_each_cpu(&sys_tune_smp_call, NULL, 1);
#endif
}
#else
static void sys_tune_refresh(void)
{
	/* The current thread is executing on the one and only CPU so
	 * the idle thread isn't running. */
}
#endif



static int sys_tune_cpu_group(int cpu)
{
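	/*
	 * Pick the idle group for @cpu: reuse the group of any hyperthread
	 * sibling that has already been assigned one, otherwise fall back
	 * to the CPU's own number.
	 */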
#ifdef CONFIG_SMP
	const cpumask_t *mask;
	int other_cpu;
	int group;

#if defined(topology_thread_cpumask) && defined(ST_HAVE_EXPORTED_CPU_SIBLING_MAP)
	/* Keep one hyperthread busy per core. */
	mask = topology_thread_cpumask(cpu);
#else
	return cpu;
#endif
	for_each_cpu_mask(other_cpu, *mask) {
		group = per_cpu(idle_cpu_group, other_cpu);
		if (group != NR_CPUS)
			return group;
	}
#endif

	return cpu;
}


static void sys_tune_add_cpu(int cpu)
{
	int group;

	/* Do nothing if this CPU has already been added. */
	if (per_cpu(idle_cpu_group, cpu) != NR_CPUS)
		return;

	group = sys_tune_cpu_group(cpu);
	per_cpu(idle_cpu_group, cpu) = group;
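	/* A newly added CPU counts as busy until it enters the idle handler
	 * and gives up its slot. */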
	atomic_inc(&(busy_cpu_count[group]));

}

static void sys_tune_del_cpu(int cpu)
{

	int group;

	if (per_cpu(idle_cpu_group, cpu) == NR_CPUS)
		return;

	group = per_cpu(idle_cpu_group, cpu);
	/* If the CPU was busy, this can cause the count to drop to
	 * zero.  To rectify this, we need to cause one of the other
	 * CPUs in the group to exit the idle loop.  If the CPU was
	 * not busy then this causes the contribution for this CPU to
	 * go to -1 which can cause the overall count to drop to zero
	 * or go negative.  To rectify this situation we need to cause
	 * this CPU to exit the idle loop. */
	atomic_dec(&(busy_cpu_count[group]));
	per_cpu(idle_cpu_group, cpu) = NR_CPUS;

}


static int sys_tune_cpu_notify(struct notifier_block *self,
			       unsigned long action, void *hcpu)
{
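	/*
	 * Hotplug notifier: place newly onlined CPUs into a group and remove
	 * dead CPUs from theirs, then kick the idle loops so the change is
	 * picked up.
	 */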
	int cpu = (long)hcpu;
	
	switch (action) {
#ifdef CPU_ONLINE_FROZEN
	case CPU_ONLINE_FROZEN:
#endif
	case CPU_ONLINE:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_add_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The CPU might have already entered the idle loop in
		 * the wrong group.  Make sure it exits the idle loop
		 * so that it picks up the correct group. */
		sys_tune_refresh();
		break;

#ifdef CPU_DEAD_FROZEN
	case CPU_DEAD_FROZEN:
#endif
	case CPU_DEAD:
		mutex_lock(&sys_tune_startup_mutex);
		sys_tune_del_cpu(cpu);
		mutex_unlock(&sys_tune_startup_mutex);
		/* The deleted CPU may have been the only busy CPU in
		 * the group.  Make sure one of the other CPUs in the
		 * group exits the idle loop. */
		sys_tune_refresh();
		break;
	}
	return NOTIFY_OK;
}


static struct notifier_block sys_tune_cpu_nb = {
	.notifier_call = sys_tune_cpu_notify,
};


static void sys_tune_ensure_init(void)
{
	BUG_ON(old_pm_idle != NULL);

	/* Atomically update pm_idle to &sys_tune_pm_idle.  The old value
	 * is stored in old_pm_idle before installing the new
	 * handler. */
	do {
		old_pm_idle = pm_idle;
	} while (cmpxchg(&pm_idle, old_pm_idle, &sys_tune_pm_idle) !=
		 old_pm_idle);
}
#endif

void sys_tune_fini(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
	void (*old)(void);
	int cpu;

	unregister_cpu_notifier(&sys_tune_cpu_nb);

	mutex_lock(&sys_tune_startup_mutex);


	/* Swap pm_idle back from our handler to the saved one.  The swap
	 * only happens if our handler is still installed. */
	old = cmpxchg(&pm_idle, &sys_tune_pm_idle, old_pm_idle);

	for_each_online_cpu(cpu)
		sys_tune_del_cpu(cpu);

	mutex_unlock(&sys_tune_startup_mutex);
	
	/* Our handler may still be executing on other CPUs.
	 * Schedule this thread on all CPUs to make sure all
	 * idle threads get interrupted. */
	sys_tune_refresh();

	/* Make sure the work item has finished executing on all CPUs.
	 * This in turn ensures that all idle threads have been
	 * interrupted. */
	flush_scheduled_work();
#endif /* CONFIG_X86 && CONFIG_APM_MODULE */
}

void sys_tune_init(void)
{
#if defined(CONFIG_X86) && defined(CONFIG_APM_MODULE)
	int cpu;

	for_each_possible_cpu(cpu) {
		INIT_WORK(&per_cpu(sys_tune_cpu_work, cpu),
			  sys_tune_work_func);
	}

	/* Start by registering the handler to ensure we don't miss
	 * any updates. */
	register_cpu_notifier(&sys_tune_cpu_nb);

	mutex_lock(&sys_tune_startup_mutex);

	for_each_online_cpu(cpu)
		sys_tune_add_cpu(cpu);

	sys_tune_ensure_init();


	mutex_unlock(&sys_tune_startup_mutex);

	/* Ensure our idle handler starts to run. */
	sys_tune_refresh();
#endif
}
