1 files changed, 131 insertions, 54 deletions
diff --git a/kernel/padata.c b/kernel/padata.c
index fd03513..b1c9857 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -29,7 +29,7 @@
 #include <linux/rcupdate.h>
 
 #define MAX_SEQ_NR INT_MAX - NR_CPUS
-#define MAX_OBJ_NUM 10000 * NR_CPUS
+#define MAX_OBJ_NUM 1000
 
 static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
 {
@@ -88,7 +88,7 @@ static void padata_parallel_worker(struct work_struct *work)
 	local_bh_enable();
 }
 
-/*
+/**
  * padata_do_parallel - padata parallelization function
  *
  * @pinst: padata instance
@@ -152,6 +152,23 @@ out:
 }
 EXPORT_SYMBOL(padata_do_parallel);
 
+/*
+ * padata_get_next - Get the next object that needs serialization.
+ *
+ * Return values are:
+ *
+ * A pointer to the control struct of the next object that needs
+ * serialization, if present in one of the percpu reorder queues.
+ *
+ * NULL, if all percpu reorder queues are empty.
+ *
+ * -EINPROGRESS, if the next object that needs serialization will
+ *  be parallel processed by another cpu and is not yet present in
+ *  the cpu's reorder queue.
+ *
+ * -ENODATA, if this cpu has to do the parallel processing for
+ *  the next object.
+ */
 static struct padata_priv *padata_get_next(struct parallel_data *pd)
 {
 	int cpu, num_cpus, empty, calc_seq_nr;
@@ -173,7 +190,7 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
 
 		/*
 		 * Calculate the seq_nr of the object that should be
-		 * next in this queue.
+		 * next in this reorder queue.
 		 */
 		overrun = 0;
 		calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus)
@@ -231,7 +248,8 @@ static struct padata_priv *padata_get_next(struct parallel_data *pd)
 		goto out;
 	}
 
-	if (next_nr % num_cpus == next_queue->cpu_index) {
+	queue = per_cpu_ptr(pd->queue, smp_processor_id());
+	if (queue->cpu_index == next_queue->cpu_index) {
 		padata = ERR_PTR(-ENODATA);
 		goto out;
 	}
@@ -247,19 +265,40 @@ static void padata_reorder(struct parallel_data *pd)
 	struct padata_queue *queue;
 	struct padata_instance *pinst = pd->pinst;
 
-try_again:
+	/*
+	 * We need to ensure that only one cpu can work on dequeueing of
+	 * the reorder queue at a time. Calculating in which percpu reorder
+	 * queue the next object will arrive takes some time. A spinlock
+	 * would be highly contended. Also it is not clear in which order
+	 * the objects arrive to the reorder queues. So a cpu could wait to
+	 * get the lock just to notice that there is nothing to do at the
+	 * moment. Therefore we use a trylock and let the holder of the lock
+	 * care for all the objects enqueued during the holdtime of the lock.
+	 */
 	if (!spin_trylock_bh(&pd->lock))
-		goto out;
+		return;
 
 	while (1) {
 		padata = padata_get_next(pd);
 
+		/*
+		 * All reorder queues are empty, or the next object that needs
+		 * serialization is parallel processed by another cpu and is
+		 * still on its way to the cpu's reorder queue, nothing to
+		 * do for now.
+		 */
 		if (!padata || PTR_ERR(padata) == -EINPROGRESS)
 			break;
 
+		/*
+		 * This cpu has to do the parallel processing of the next
+		 * object. It's waiting in the cpu's parallelization queue,
+		 * so exit immediately.
+		 */
 		if (PTR_ERR(padata) == -ENODATA) {
+			del_timer(&pd->timer);
 			spin_unlock_bh(&pd->lock);
-			goto out;
+			return;
 		}
 
 		queue = per_cpu_ptr(pd->queue, padata->cb_cpu);
@@ -273,13 +312,27 @@ try_again:
 
 	spin_unlock_bh(&pd->lock);
 
-	if (atomic_read(&pd->reorder_objects))
-		goto try_again;
+	/*
+	 * The next object that needs serialization might have arrived to
+	 * the reorder queues in the meantime, we will be called again
+	 * from the timer function if no one else cares for it.
+	 */
+	if (atomic_read(&pd->reorder_objects)
+			&& !(pinst->flags & PADATA_RESET))
+		mod_timer(&pd->timer, jiffies + HZ);
+	else
+		del_timer(&pd->timer);
 
-out:
 	return;
 }
 
+static void padata_reorder_timer(unsigned long arg)
+{
+	struct parallel_data *pd = (struct parallel_data *)arg;
+
+	padata_reorder(pd);	/* arg is the pd handed to setup_timer() */
+}
+
 static void padata_serial_worker(struct work_struct *work)
 {
 	struct padata_queue *queue;
@@ -308,7 +361,7 @@ static void padata_serial_worker(struct work_struct *work)
 	local_bh_enable();
 }
 
-/*
+/**
  * padata_do_serial - padata serialization function
  *
  * @padata: object to be serialized.
@@ -338,6 +391,7 @@ void padata_do_serial(struct padata_priv *padata)
 }
 EXPORT_SYMBOL(padata_do_serial);
 
+/* Allocate and initialize the internal cpumask dependent resources. */
 static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 					     const struct cpumask *cpumask)
 {
@@ -358,17 +412,15 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 	if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL))
 		goto err_free_queue;
 
-	for_each_possible_cpu(cpu) {
+	cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
+
+	for_each_cpu(cpu, pd->cpumask) {
 		queue = per_cpu_ptr(pd->queue, cpu);
 
 		queue->pd = pd;
 
-		if (cpumask_test_cpu(cpu, cpumask)
-		    && cpumask_test_cpu(cpu, cpu_active_mask)) {
-			queue->cpu_index = cpu_index;
-			cpu_index++;
-		} else
-			queue->cpu_index = -1;
+		queue->cpu_index = cpu_index;
+		cpu_index++;
 
 		INIT_LIST_HEAD(&queue->reorder.list);
 		INIT_LIST_HEAD(&queue->parallel.list);
@@ -382,11 +434,10 @@ static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst,
 		atomic_set(&queue->num_obj, 0);
 	}
 
-	cpumask_and(pd->cpumask, cpumask, cpu_active_mask);
-
 	num_cpus = cpumask_weight(pd->cpumask);
 	pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1;
 
+	setup_timer(&pd->timer, padata_reorder_timer, (unsigned long)pd);
 	atomic_set(&pd->seq_nr, -1);
 	atomic_set(&pd->reorder_objects, 0);
 	atomic_set(&pd->refcnt, 0);
@@ -410,6 +461,31 @@ static void padata_free_pd(struct parallel_data *pd)
 	kfree(pd);
 }
 
+/* Flush all objects out of the padata queues. */
+static void padata_flush_queues(struct parallel_data *pd)
+{
+	int cpu;
+	struct padata_queue *queue;
+
+	for_each_cpu(cpu, pd->cpumask) {
+		queue = per_cpu_ptr(pd->queue, cpu);
+		flush_work(&queue->pwork);	/* wait for the parallel work */
+	}
+
+	del_timer_sync(&pd->timer);	/* stop a pending reorder timer */
+
+	if (atomic_read(&pd->reorder_objects))
+		padata_reorder(pd);	/* one more reorder pass by hand */
+
+	for_each_cpu(cpu, pd->cpumask) {
+		queue = per_cpu_ptr(pd->queue, cpu);
+		flush_work(&queue->swork);	/* wait for the serial work */
+	}
+
+	BUG_ON(atomic_read(&pd->refcnt) != 0);	/* no reference may remain */
+}
+
+/* Replace the internal control structure with a new one. */
 static void padata_replace(struct padata_instance *pinst,
 			   struct parallel_data *pd_new)
 {
@@ -421,17 +497,13 @@ static void padata_replace(struct padata_instance *pinst,
 
 	synchronize_rcu();
 
-	while (atomic_read(&pd_old->refcnt) != 0)
-		yield();
-
-	flush_workqueue(pinst->wq);
-
+	padata_flush_queues(pd_old);
 	padata_free_pd(pd_old);
 
 	pinst->flags &= ~PADATA_RESET;
 }
 
-/*
+/**
  * padata_set_cpumask - set the cpumask that padata should use
  *
  * @pinst: padata instance
@@ -443,10 +515,10 @@ int padata_set_cpumask(struct padata_instance *pinst,
 	struct parallel_data *pd;
 	int err = 0;
 
-	might_sleep();
-
 	mutex_lock(&pinst->lock);
 
+	get_online_cpus();
+
 	pd = padata_alloc_pd(pinst, cpumask);
 	if (!pd) {
 		err = -ENOMEM;
@@ -458,6 +530,8 @@ int padata_set_cpumask(struct padata_instance *pinst,
 	padata_replace(pinst, pd);
 
 out:
+	put_online_cpus();
+
 	mutex_unlock(&pinst->lock);
 
 	return err;
@@ -479,7 +553,7 @@ static int __padata_add_cpu(struct padata_instance *pinst, int cpu)
 	return 0;
 }
 
-/*
+/**
  * padata_add_cpu - add a cpu to the padata cpumask
  *
  * @pinst: padata instance
@@ -489,12 +563,12 @@ int padata_add_cpu(struct padata_instance *pinst, int cpu)
 {
 	int err;
 
-	might_sleep();
-
 	mutex_lock(&pinst->lock);
 
+	get_online_cpus();
 	cpumask_set_cpu(cpu, pinst->cpumask);
 	err = __padata_add_cpu(pinst, cpu);
+	put_online_cpus();
 
 	mutex_unlock(&pinst->lock);
 
@@ -517,7 +591,7 @@ static int __padata_remove_cpu(struct padata_instance *pinst, int cpu)
 	return 0;
 }
 
-/*
+/**
  * padata_remove_cpu - remove a cpu from the padata cpumask
  *
  * @pinst: padata instance
@@ -527,12 +601,12 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
 {
 	int err;
 
-	might_sleep();
-
 	mutex_lock(&pinst->lock);
 
+	get_online_cpus();
 	cpumask_clear_cpu(cpu, pinst->cpumask);
 	err = __padata_remove_cpu(pinst, cpu);
+	put_online_cpus();
 
 	mutex_unlock(&pinst->lock);
 
@@ -540,38 +614,35 @@ int padata_remove_cpu(struct padata_instance *pinst, int cpu)
 }
 EXPORT_SYMBOL(padata_remove_cpu);
 
-/*
+/**
  * padata_start - start the parallel processing
  *
  * @pinst: padata instance to start
  */
 void padata_start(struct padata_instance *pinst)
 {
-	might_sleep();
-
 	mutex_lock(&pinst->lock);
 	pinst->flags |= PADATA_INIT;
 	mutex_unlock(&pinst->lock);
 }
 EXPORT_SYMBOL(padata_start);
 
-/*
+/**
  * padata_stop - stop the parallel processing
  *
  * @pinst: padata instance to stop
  */
 void padata_stop(struct padata_instance *pinst)
 {
-	might_sleep();
-
 	mutex_lock(&pinst->lock);
 	pinst->flags &= ~PADATA_INIT;
 	mutex_unlock(&pinst->lock);
 }
 EXPORT_SYMBOL(padata_stop);
 
-static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
-					 unsigned long action, void *hcpu)
+#ifdef CONFIG_HOTPLUG_CPU
+static int padata_cpu_callback(struct notifier_block *nfb,
+			       unsigned long action, void *hcpu)
 {
 	int err;
 	struct padata_instance *pinst;
@@ -621,8 +692,9 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
+#endif
 
-/*
+/**
  * padata_alloc - allocate and initialize a padata instance
  *
  * @cpumask: cpumask that padata uses for parallelization
@@ -631,7 +703,6 @@ static int __cpuinit padata_cpu_callback(struct notifier_block *nfb,
 struct padata_instance *padata_alloc(const struct cpumask *cpumask,
 				     struct workqueue_struct *wq)
 {
-	int err;
 	struct padata_instance *pinst;
 	struct parallel_data *pd;
 
@@ -639,6 +710,8 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
 	if (!pinst)
 		goto err;
 
+	get_online_cpus();
+
 	pd = padata_alloc_pd(pinst, cpumask);
 	if (!pd)
 		goto err_free_inst;
@@ -654,31 +727,32 @@ struct padata_instance *padata_alloc(const struct cpumask *cpumask,
 
 	pinst->flags = 0;
 
+#ifdef CONFIG_HOTPLUG_CPU
 	pinst->cpu_notifier.notifier_call = padata_cpu_callback;
 	pinst->cpu_notifier.priority = 0;
-	err = register_hotcpu_notifier(&pinst->cpu_notifier);
-	if (err)
-		goto err_free_cpumask;
+	register_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+
+	put_online_cpus();
 
 	mutex_init(&pinst->lock);
 
 	return pinst;
 
-err_free_cpumask:
-	free_cpumask_var(pinst->cpumask);
 err_free_pd:
 	padata_free_pd(pd);
 err_free_inst:
 	kfree(pinst);
+	put_online_cpus();
 err:
 	return NULL;
 }
 EXPORT_SYMBOL(padata_alloc);
 
-/*
+/**
  * padata_free - free a padata instance
  *
- * @ padata_inst: padata instance to free
+ * @pinst: padata instance to free
  */
 void padata_free(struct padata_instance *pinst)
 {
@@ -686,10 +760,13 @@ void padata_free(struct padata_instance *pinst)
 
 	synchronize_rcu();
 
-	while (atomic_read(&pinst->pd->refcnt) != 0)
-		yield();
-
+#ifdef CONFIG_HOTPLUG_CPU
 	unregister_hotcpu_notifier(&pinst->cpu_notifier);
+#endif
+	get_online_cpus();
+	padata_flush_queues(pinst->pd);
+	put_online_cpus();
+
 	padata_free_pd(pinst->pd);
 	free_cpumask_var(pinst->cpumask);
 	kfree(pinst);