4 files changed, 61 insertions, 99 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 67642ba..915e84d 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7610,16 +7610,12 @@ EXPORT_SYMBOL(unregister_md_cluster_operations);
 
 int md_setup_cluster(struct mddev *mddev, int nodes)
 {
-	int err;
-
-	err = request_module("md-cluster");
-	if (err) {
-		pr_err("md-cluster module not found.\n");
-		return -ENOENT;
-	}
-
+	if (!md_cluster_ops)
+		request_module("md-cluster");
 	spin_lock(&pers_lock);
+	/* ensure module won't be unloaded */
 	if (!md_cluster_ops || !try_module_get(md_cluster_mod)) {
+		pr_err("can't find md-cluster module or get it's reference.\n");
 		spin_unlock(&pers_lock);
 		return -ENOENT;
 	}
diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c
index 51f76dd..1b1ab4a 100644
--- a/drivers/md/raid5-cache.c
+++ b/drivers/md/raid5-cache.c
@@ -96,7 +96,6 @@ struct r5l_log {
 	spinlock_t no_space_stripes_lock;
 
 	bool need_cache_flush;
-	bool in_teardown;
 };
 
 /*
@@ -704,31 +703,22 @@ static void r5l_write_super_and_discard_space(struct r5l_log *log,
 
 	mddev = log->rdev->mddev;
 	/*
-	 * This is to avoid a deadlock. r5l_quiesce holds reconfig_mutex and
-	 * wait for this thread to finish. This thread waits for
-	 * MD_CHANGE_PENDING clear, which is supposed to be done in
-	 * md_check_recovery(). md_check_recovery() tries to get
-	 * reconfig_mutex. Since r5l_quiesce already holds the mutex,
-	 * md_check_recovery() fails, so the PENDING never get cleared. The
-	 * in_teardown check workaround this issue.
+	 * Discard could zero data, so before discard we must make sure
+	 * superblock is updated to new log tail. Updating superblock (either
+	 * directly call md_update_sb() or depend on md thread) must hold
+	 * reconfig mutex. On the other hand, raid5_quiesce is called with
+	 * reconfig_mutex hold. The first step of raid5_quiesce() is waitting
+	 * for all IO finish, hence waitting for reclaim thread, while reclaim
+	 * thread is calling this function and waitting for reconfig mutex. So
+	 * there is a deadlock. We workaround this issue with a trylock.
+	 * FIXME: we could miss discard if we can't take reconfig mutex
 	 */
-	if (!log->in_teardown) {
-		set_mask_bits(&mddev->flags, 0,
-			      BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
-		md_wakeup_thread(mddev->thread);
-		wait_event(mddev->sb_wait,
-			!test_bit(MD_CHANGE_PENDING, &mddev->flags) ||
-			log->in_teardown);
-		/*
-		 * r5l_quiesce could run after in_teardown check and hold
-		 * mutex first. Superblock might get updated twice.
-		 */
-		if (log->in_teardown)
-			md_update_sb(mddev, 1);
-	} else {
-		WARN_ON(!mddev_is_locked(mddev));
-		md_update_sb(mddev, 1);
-	}
+	set_mask_bits(&mddev->flags, 0,
+		BIT(MD_CHANGE_DEVS) | BIT(MD_CHANGE_PENDING));
+	if (!mddev_trylock(mddev))
+		return;
+	md_update_sb(mddev, 1);
+	mddev_unlock(mddev);
 
 	/* discard IO error really doesn't matter, ignore it */
 	if (log->last_checkpoint < end) {
@@ -827,7 +817,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
 	if (!log || state == 2)
 		return;
 	if (state == 0) {
-		log->in_teardown = 0;
 		/*
 		 * This is a special case for hotadd. In suspend, the array has
 		 * no journal. In resume, journal is initialized as well as the
@@ -838,11 +827,6 @@ void r5l_quiesce(struct r5l_log *log, int state)
 		log->reclaim_thread = md_register_thread(r5l_reclaim_thread,
 					log->rdev->mddev, "reclaim");
 	} else if (state == 1) {
-		/*
-		 * at this point all stripes are finished, so io_unit is at
-		 * least in STRIPE_END state
-		 */
-		log->in_teardown = 1;
 		/* make sure r5l_write_super_and_discard_space exits */
 		mddev = log->rdev->mddev;
 		wake_up(&mddev->sb_wait);
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index da583bb..5287e79 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2423,10 +2423,10 @@ static void raid5_end_read_request(struct bio * bi)
 		}
 	}
 	rdev_dec_pending(rdev, conf->mddev);
+	bio_reset(bi);
 	clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
 	raid5_release_stripe(sh);
-	bio_reset(bi);
 }
 
 static void raid5_end_write_request(struct bio *bi)
@@ -2498,6 +2498,7 @@ static void raid5_end_write_request(struct bio *bi)
 	if (sh->batch_head && bi->bi_error && !replacement)
 		set_bit(STRIPE_BATCH_ERR, &sh->batch_head->state);
 
+	bio_reset(bi);
 	if (!test_and_clear_bit(R5_DOUBLE_LOCKED, &sh->dev[i].flags))
 		clear_bit(R5_LOCKED, &sh->dev[i].flags);
 	set_bit(STRIPE_HANDLE, &sh->state);
@@ -2505,7 +2506,6 @@ static void raid5_end_write_request(struct bio *bi)
 
 	if (sh->batch_head && sh != sh->batch_head)
 		raid5_release_stripe(sh->batch_head);
-	bio_reset(bi);
 }
 
 static void raid5_build_block(struct stripe_head *sh, int i, int previous)
@@ -6349,22 +6349,20 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu
 	return 0;
 }
 
-static void raid5_free_percpu(struct r5conf *conf)
+static int raid456_cpu_dead(unsigned int cpu, struct hlist_node *node)
 {
-	unsigned long cpu;
+	struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
 
+	free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
+	return 0;
+}
+
+static void raid5_free_percpu(struct r5conf *conf)
+{
 	if (!conf->percpu)
 		return;
 
-#ifdef CONFIG_HOTPLUG_CPU
-	unregister_cpu_notifier(&conf->cpu_notify);
-#endif
-
-	get_online_cpus();
-	for_each_possible_cpu(cpu)
-		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
-	put_online_cpus();
-
+	cpuhp_state_remove_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
 	free_percpu(conf->percpu);
 }
 
@@ -6383,64 +6381,28 @@ static void free_conf(struct r5conf *conf)
 	kfree(conf);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-static int raid456_cpu_notify(struct notifier_block *nfb, unsigned long action,
-			      void *hcpu)
+static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node)
 {
-	struct r5conf *conf = container_of(nfb, struct r5conf, cpu_notify);
-	long cpu = (long)hcpu;
+	struct r5conf *conf = hlist_entry_safe(node, struct r5conf, node);
 	struct raid5_percpu *percpu = per_cpu_ptr(conf->percpu, cpu);
 
-	switch (action) {
-	case CPU_UP_PREPARE:
-	case CPU_UP_PREPARE_FROZEN:
-		if (alloc_scratch_buffer(conf, percpu)) {
-			pr_err("%s: failed memory allocation for cpu%ld\n",
-			       __func__, cpu);
-			return notifier_from_errno(-ENOMEM);
-		}
-		break;
-	case CPU_DEAD:
-	case CPU_DEAD_FROZEN:
-	case CPU_UP_CANCELED:
-	case CPU_UP_CANCELED_FROZEN:
-		free_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
-		break;
-	default:
-		break;
+	if (alloc_scratch_buffer(conf, percpu)) {
+		pr_err("%s: failed memory allocation for cpu%u\n",
+		       __func__, cpu);
+		return -ENOMEM;
 	}
-	return NOTIFY_OK;
+	return 0;
 }
-#endif
 
 static int raid5_alloc_percpu(struct r5conf *conf)
 {
-	unsigned long cpu;
 	int err = 0;
 
 	conf->percpu = alloc_percpu(struct raid5_percpu);
 	if (!conf->percpu)
 		return -ENOMEM;
 
-#ifdef CONFIG_HOTPLUG_CPU
-	conf->cpu_notify.notifier_call = raid456_cpu_notify;
-	conf->cpu_notify.priority = 0;
-	err = register_cpu_notifier(&conf->cpu_notify);
-	if (err)
-		return err;
-#endif
-
-	get_online_cpus();
-	for_each_present_cpu(cpu) {
-		err = alloc_scratch_buffer(conf, per_cpu_ptr(conf->percpu, cpu));
-		if (err) {
-			pr_err("%s: failed memory allocation for cpu%ld\n",
-			       __func__, cpu);
-			break;
-		}
-	}
-	put_online_cpus();
-
+	err = cpuhp_state_add_instance(CPUHP_MD_RAID5_PREPARE, &conf->node);
 	if (!err) {
 		conf->scribble_disks = max(conf->raid_disks,
 			conf->previous_raid_disks);
@@ -6639,6 +6601,16 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	}
 
 	conf->min_nr_stripes = NR_STRIPES;
+	if (mddev->reshape_position != MaxSector) {
+		int stripes = max_t(int,
+			((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4,
+			((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4);
+		conf->min_nr_stripes = max(NR_STRIPES, stripes);
+		if (conf->min_nr_stripes != NR_STRIPES)
+			printk(KERN_INFO
+				"md/raid:%s: force stripe size %d for reshape\n",
+				mdname(mddev), conf->min_nr_stripes);
+	}
 	memory = conf->min_nr_stripes * (sizeof(struct stripe_head) +
 		 max_disks * ((sizeof(struct bio) + PAGE_SIZE))) / 1024;
 	atomic_set(&conf->empty_inactive_list_nr, NR_STRIPE_HASH_LOCKS);
@@ -7975,10 +7947,21 @@ static struct md_personality raid4_personality =
 
 static int __init raid5_init(void)
 {
+	int ret;
+
 	raid5_wq = alloc_workqueue("raid5wq",
 		WQ_UNBOUND|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE|WQ_SYSFS, 0);
 	if (!raid5_wq)
 		return -ENOMEM;
+
+	ret = cpuhp_setup_state_multi(CPUHP_MD_RAID5_PREPARE,
+				      "md/raid5:prepare",
+				      raid456_cpu_up_prepare,
+				      raid456_cpu_dead);
+	if (ret) {
+		destroy_workqueue(raid5_wq);
+		return ret;
+	}
 	register_md_personality(&raid6_personality);
 	register_md_personality(&raid5_personality);
 	register_md_personality(&raid4_personality);
@@ -7990,6 +7973,7 @@ static void raid5_exit(void)
 	unregister_md_personality(&raid6_personality);
 	unregister_md_personality(&raid5_personality);
 	unregister_md_personality(&raid4_personality);
+	cpuhp_remove_multi_state(CPUHP_MD_RAID5_PREPARE);
 	destroy_workqueue(raid5_wq);
 }
 
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 517d4b6..57ec49f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -512,9 +512,7 @@ struct r5conf {
 	} __percpu *percpu;
 	int scribble_disks;
 	int scribble_sectors;
-#ifdef CONFIG_HOTPLUG_CPU
-	struct notifier_block	cpu_notify;
-#endif
+	struct hlist_node node;
 
 	/*
 	 * Free stripes pool