author    | Dan Williams <dan.j.williams@intel.com> | 2008-04-28 02:15:53 -0700
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2008-04-28 08:58:42 -0700
commit    | 8b3e6cdc53b7f29f7026955d6cb6902a49322a15 (patch)
tree      | 69d9f22a526e687fe2892d327caa3fa6ddd65cf6 /drivers/md
parent    | e46b272b6608783ed7aa7b0594871550ce20b849 (diff)
download  | op-kernel-dev-8b3e6cdc53b7f29f7026955d6cb6902a49322a15.zip
          | op-kernel-dev-8b3e6cdc53b7f29f7026955d6cb6902a49322a15.tar.gz
md: introduce get_priority_stripe() to improve raid456 write performance
Improve write performance by preventing the delayed_list from dumping all its
stripes onto the handle_list in one shot. Delayed stripes are now further
delayed by being held on the 'hold_list'. The 'hold_list' is bypassed when:
* a STRIPE_IO_STARTED stripe is found at the head of 'handle_list'
* 'handle_list' is empty and i/o is being done to satisfy full stripe-width
write requests
* 'bypass_count' is less than 'bypass_threshold'. By default the threshold
is 1, i.e. every other stripe handled is a preread stripe provided the
top two conditions are false.
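For illustration, the following is a minimal, self-contained userspace model of the selection policy described by the conditions above. Every identifier in it (sched_state, pick_next, the plain integer counters) is invented for this sketch; the authoritative logic is __get_priority_stripe() in the patch below, and the model deliberately leaves out the last_hold bookkeeping that credits bypass_count when a held stripe gets promoted on its own.

/* Simplified model of the handle_list/hold_list selection policy.
 * Names here are illustrative only; the in-kernel version is
 * __get_priority_stripe() in the patch further down. */
#include <stdbool.h>
#include <stdio.h>

struct sched_state {
    int handle_len;           /* stripes ready for normal handling */
    int hold_len;             /* delayed (preread) stripes being held */
    bool head_io_started;     /* head of handle_list already has i/o in flight */
    int pending_full_writes;  /* full stripe-width writes still queued */
    int bypass_count;
    int bypass_threshold;     /* BYPASS_THRESHOLD defaults to 1 in the patch */
};

/* Returns 'H' when a handle_list stripe is chosen, 'P' when a held preread
 * stripe is promoted, or 0 when nothing should be handled yet. */
static char pick_next(struct sched_state *s)
{
    if (s->handle_len) {
        /* Service handle_list and account for bypassing the hold_list. */
        if (!s->hold_len)
            s->bypass_count = 0;
        else if (!s->head_io_started)
            s->bypass_count++;
        s->handle_len--;
        return 'H';
    }
    if (s->hold_len &&
        ((s->bypass_threshold &&
          s->bypass_count > s->bypass_threshold) ||
         s->pending_full_writes == 0)) {
        /* Fairness kicked in, or no full stripe-width writes are waiting:
         * promote a held preread stripe and pay back the bypass credit. */
        s->bypass_count -= s->bypass_threshold;
        if (s->bypass_count < 0)
            s->bypass_count = 0;
        s->hold_len--;
        return 'P';
    }
    return 0; /* keep holding the delayed stripes */
}

int main(void)
{
    struct sched_state s = {
        .handle_len = 2, .hold_len = 3,
        .pending_full_writes = 1, .bypass_threshold = 1,
    };
    char c;

    while ((c = pick_next(&s)) != 0)
        printf("picked %c (bypass_count=%d)\n", c, s.bypass_count);
    return 0;
}

With the default threshold of 1, this services the two ready stripes, promotes a single held stripe, and then stops, because a full stripe-width write is still pending and the bypass credit has been spent.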
Benchmark data:
System: 2x Xeon 5150, 4x SATA, mem=1GB
Baseline: 2.6.24-rc7
Configuration: mdadm --create /dev/md0 /dev/sd[b-e] -n 4 -l 5 --assume-clean
Test1: dd if=/dev/zero of=/dev/md0 bs=1024k count=2048
* patched: +33% (stripe_cache_size = 256), +25% (stripe_cache_size = 512)
Test2: tiobench --size 2048 --numruns 5 --block 4096 --block 131072 (XFS)
* patched: +13%
* patched + preread_bypass_threshold = 0: +37%
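(The preread_bypass_threshold in that last run is presumably adjusted through the sysfs attribute this patch adds; on the /dev/md0 array from the configuration above that would be:
  echo 0 > /sys/block/md0/md/preread_bypass_threshold)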
Changes since v1:
* reduce bypass_threshold from (chunk_size / sectors_per_chunk) to (1) and
make it configurable. This defaults to fairness and modest performance
gains out of the box.
Changes since v2:
* [neilb@suse.de]: kill STRIPE_PRIO_HI and preread_needed as they are not
necessary, the important change was clearing STRIPE_DELAYED in
add_stripe_bio and this has been moved out to make_request for the hang
fix.
* [neilb@suse.de]: simplify get_priority_stripe
* [dan.j.williams@intel.com]: reset the bypass_count when ->hold_list is
sampled empty (+11%)
* [dan.j.williams@intel.com]: decrement the bypass_count at the detection
of stripes being naturally promoted off of hold_list +2%. Note, resetting
bypass_count instead of decrementing on these events yields +4% but that is
probably too aggressive.
Changes since v3:
* cosmetic fixups
Tested-by: James W. Laferriere <babydr@baby-dragons.com>
Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'drivers/md')
-rw-r--r-- | drivers/md/raid5.c | 122
1 file changed, 112 insertions, 10 deletions
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 4efec46..45eead6 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -63,6 +63,7 @@
 #define STRIPE_SHIFT (PAGE_SHIFT - 9)
 #define STRIPE_SECTORS (STRIPE_SIZE>>9)
 #define IO_THRESHOLD 1
+#define BYPASS_THRESHOLD 1
 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head))
 #define HASH_MASK (NR_HASH - 1)
 
@@ -398,6 +399,7 @@ static void ops_run_io(struct stripe_head *sh)
 
     might_sleep();
 
+    set_bit(STRIPE_IO_STARTED, &sh->state);
     for (i = disks; i--; ) {
         int rw;
         struct bio *bi;
@@ -1720,6 +1722,9 @@ handle_write_operations5(struct stripe_head *sh, int rcw, int expand)
                 locked++;
             }
         }
+        if (locked + 1 == disks)
+            if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+                atomic_inc(&sh->raid_conf->pending_full_writes);
     } else {
         BUG_ON(!(test_bit(R5_UPTODATE, &sh->dev[pd_idx].flags) ||
             test_bit(R5_Wantcompute, &sh->dev[pd_idx].flags)));
@@ -1947,6 +1952,9 @@ handle_requests_to_failed_array(raid5_conf_t *conf, struct stripe_head *sh,
                     STRIPE_SECTORS, 0, 0);
         }
 
+    if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+        if (atomic_dec_and_test(&conf->pending_full_writes))
+            md_wakeup_thread(conf->mddev->thread);
 }
 
 /* __handle_issuing_new_read_requests5 - returns 0 if there are no more disks
@@ -2149,6 +2157,10 @@ static void handle_completed_write_requests(raid5_conf_t *conf,
                             0);
             }
     }
+
+    if (test_and_clear_bit(STRIPE_FULL_WRITE, &sh->state))
+        if (atomic_dec_and_test(&conf->pending_full_writes))
+            md_wakeup_thread(conf->mddev->thread);
 }
 
 static void handle_issuing_new_write_requests5(raid5_conf_t *conf,
@@ -2333,6 +2345,9 @@ static void handle_issuing_new_write_requests6(raid5_conf_t *conf,
                 s->locked++;
                 set_bit(R5_Wantwrite, &sh->dev[i].flags);
             }
+        if (s->locked == disks)
+            if (!test_and_set_bit(STRIPE_FULL_WRITE, &sh->state))
+                atomic_inc(&conf->pending_full_writes);
 
         /* after a RECONSTRUCT_WRITE, the stripe MUST be in-sync */
         set_bit(STRIPE_INSYNC, &sh->state);
@@ -3094,6 +3109,8 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
         else
             continue;
 
+        set_bit(STRIPE_IO_STARTED, &sh->state);
+
         bi = &sh->dev[i].req;
 
         bi->bi_rw = rw;
@@ -3164,7 +3181,7 @@ static void raid5_activate_delayed(raid5_conf_t *conf)
             clear_bit(STRIPE_DELAYED, &sh->state);
             if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state))
                 atomic_inc(&conf->preread_active_stripes);
-            list_add_tail(&sh->lru, &conf->handle_list);
+            list_add_tail(&sh->lru, &conf->hold_list);
         }
     } else
         blk_plug_device(conf->mddev->queue);
@@ -3442,6 +3459,58 @@ static int chunk_aligned_read(struct request_queue *q, struct bio * raid_bio)
     }
 }
 
+/* __get_priority_stripe - get the next stripe to process
+ *
+ * Full stripe writes are allowed to pass preread active stripes up until
+ * the bypass_threshold is exceeded. In general the bypass_count
+ * increments when the handle_list is handled before the hold_list; however, it
+ * will not be incremented when STRIPE_IO_STARTED is sampled set signifying a
+ * stripe with in flight i/o. The bypass_count will be reset when the
+ * head of the hold_list has changed, i.e. the head was promoted to the
+ * handle_list.
+ */
+static struct stripe_head *__get_priority_stripe(raid5_conf_t *conf)
+{
+    struct stripe_head *sh;
+
+    pr_debug("%s: handle: %s hold: %s full_writes: %d bypass_count: %d\n",
+          __func__,
+          list_empty(&conf->handle_list) ? "empty" : "busy",
+          list_empty(&conf->hold_list) ? "empty" : "busy",
+          atomic_read(&conf->pending_full_writes), conf->bypass_count);
+
+    if (!list_empty(&conf->handle_list)) {
+        sh = list_entry(conf->handle_list.next, typeof(*sh), lru);
+
+        if (list_empty(&conf->hold_list))
+            conf->bypass_count = 0;
+        else if (!test_bit(STRIPE_IO_STARTED, &sh->state)) {
+            if (conf->hold_list.next == conf->last_hold)
+                conf->bypass_count++;
+            else {
+                conf->last_hold = conf->hold_list.next;
+                conf->bypass_count -= conf->bypass_threshold;
+                if (conf->bypass_count < 0)
+                    conf->bypass_count = 0;
+            }
+        }
+    } else if (!list_empty(&conf->hold_list) &&
+           ((conf->bypass_threshold &&
+             conf->bypass_count > conf->bypass_threshold) ||
+            atomic_read(&conf->pending_full_writes) == 0)) {
+        sh = list_entry(conf->hold_list.next,
+                typeof(*sh), lru);
+        conf->bypass_count -= conf->bypass_threshold;
+        if (conf->bypass_count < 0)
+            conf->bypass_count = 0;
+    } else
+        return NULL;
+
+    list_del_init(&sh->lru);
+    atomic_inc(&sh->count);
+    BUG_ON(atomic_read(&sh->count) != 1);
+    return sh;
+}
 
 static int make_request(struct request_queue *q, struct bio * bi)
 {
@@ -3914,7 +3983,6 @@ static void raid5d(mddev_t *mddev)
     handled = 0;
     spin_lock_irq(&conf->device_lock);
     while (1) {
-        struct list_head *first;
         struct bio *bio;
 
         if (conf->seq_flush != conf->seq_write) {
@@ -3936,17 +4004,12 @@ static void raid5d(mddev_t *mddev)
             handled++;
         }
 
-        if (list_empty(&conf->handle_list)) {
+        sh = __get_priority_stripe(conf);
+
+        if (!sh) {
             async_tx_issue_pending_all();
             break;
         }
-
-        first = conf->handle_list.next;
-        sh = list_entry(first, struct stripe_head, lru);
-
-        list_del_init(first);
-        atomic_inc(&sh->count);
-        BUG_ON(atomic_read(&sh->count)!= 1);
         spin_unlock_irq(&conf->device_lock);
 
         handled++;
@@ -4011,6 +4074,42 @@ raid5_stripecache_size = __ATTR(stripe_cache_size, S_IRUGO | S_IWUSR,
                 raid5_store_stripe_cache_size);
 
 static ssize_t
+raid5_show_preread_threshold(mddev_t *mddev, char *page)
+{
+    raid5_conf_t *conf = mddev_to_conf(mddev);
+    if (conf)
+        return sprintf(page, "%d\n", conf->bypass_threshold);
+    else
+        return 0;
+}
+
+static ssize_t
+raid5_store_preread_threshold(mddev_t *mddev, const char *page, size_t len)
+{
+    raid5_conf_t *conf = mddev_to_conf(mddev);
+    char *end;
+    int new;
+    if (len >= PAGE_SIZE)
+        return -EINVAL;
+    if (!conf)
+        return -ENODEV;
+
+    new = simple_strtoul(page, &end, 10);
+    if (!*page || (*end && *end != '\n'))
+        return -EINVAL;
+    if (new > conf->max_nr_stripes || new < 0)
+        return -EINVAL;
+    conf->bypass_threshold = new;
+    return len;
+}
+
+static struct md_sysfs_entry
+raid5_preread_bypass_threshold = __ATTR(preread_bypass_threshold,
＋                    S_IRUGO | S_IWUSR,
+                    raid5_show_preread_threshold,
+                    raid5_store_preread_threshold);
+
+static ssize_t
 stripe_cache_active_show(mddev_t *mddev, char *page)
 {
     raid5_conf_t *conf = mddev_to_conf(mddev);
@@ -4026,6 +4125,7 @@ raid5_stripecache_active = __ATTR_RO(stripe_cache_active);
 static struct attribute *raid5_attrs[] = {
     &raid5_stripecache_size.attr,
     &raid5_stripecache_active.attr,
+    &raid5_preread_bypass_threshold.attr,
     NULL,
 };
 static struct attribute_group raid5_attrs_group = {
@@ -4130,12 +4230,14 @@ static int run(mddev_t *mddev)
     init_waitqueue_head(&conf->wait_for_stripe);
     init_waitqueue_head(&conf->wait_for_overlap);
     INIT_LIST_HEAD(&conf->handle_list);
+    INIT_LIST_HEAD(&conf->hold_list);
     INIT_LIST_HEAD(&conf->delayed_list);
     INIT_LIST_HEAD(&conf->bitmap_list);
     INIT_LIST_HEAD(&conf->inactive_list);
     atomic_set(&conf->active_stripes, 0);
     atomic_set(&conf->preread_active_stripes, 0);
     atomic_set(&conf->active_aligned_reads, 0);
+    conf->bypass_threshold = BYPASS_THRESHOLD;
 
     pr_debug("raid5: run(%s) called.\n", mdname(mddev));