author    mm <mm@FreeBSD.org>    2011-07-18 08:29:49 +0000
committer mm <mm@FreeBSD.org>    2011-07-18 08:29:49 +0000
commit    94656ca305cce2dc4b2fc31d0f556398dea3a845 (patch)
tree      e82006ca212476e2a1fa75a95c28ae56cf01202f /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
parent    5f3a3207a0d86fe328ed6c579d11c39c6d6a6a43 (diff)
ZFS tries to allocate blocks evenly across all devices. This means that when
devices are imbalanced, zfs will spend lots of CPU searching for space on
devices which tend to be pretty full. It should instead fail quickly on the
full devices and move on to devices which have more availability.

New loader tunable: vfs.zfs.mg_alloc_failures (min = 8)

Illumos-gate changeset: 13379:4df42cc92254
Obtained from: Illumos (Bug #1051)
MFC after: 2 weeks
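[Editor's sketch] The fail-fast behavior described above can be illustrated by a small standalone C program. This is a hypothetical stand-in, not the committed code: the struct and function names below are made up, and the real gate in the diff also requires CAN_FASTGANG(flags), psize > SPA_GANGBLOCKSIZE, and a primary-weight allocation.

/*
 * Minimal sketch: once a metaslab group has seen more than
 * zfs_mg_alloc_failures failed attempts in a txg, skip it in O(1)
 * instead of searching its metaslabs exhaustively.
 */
#include <stdint.h>
#include <stdio.h>

struct mg {                      /* illustrative stand-in for metaslab_group_t */
	const char *name;
	uint64_t alloc_failures; /* failed attempts this txg */
	uint64_t free_space;     /* bytes still free */
};

static int zfs_mg_alloc_failures = 8;   /* cf. the new tunable (min = 8) */

/* Return 1 if the allocator should skip this group outright. */
static int
mg_should_skip(const struct mg *mg)
{
	return (mg->alloc_failures > (uint64_t)zfs_mg_alloc_failures);
}

static int
mg_alloc(struct mg *mg, uint64_t size)
{
	if (mg_should_skip(mg)) {
		printf("%s: skipping (failures=%ju)\n",
		    mg->name, (uintmax_t)mg->alloc_failures);
		return (-1);
	}
	if (mg->free_space < size) {
		mg->alloc_failures++;   /* remember the miss */
		return (-1);
	}
	mg->free_space -= size;
	return (0);
}

int
main(void)
{
	struct mg full  = { "vdev0 (nearly full)",  9, 4096 };
	struct mg empty = { "vdev1 (mostly empty)", 0, 1ULL << 30 };

	/* The full device is skipped immediately; the rotor moves on. */
	if (mg_alloc(&full, 128 * 1024) != 0)
		mg_alloc(&empty, 128 * 1024);
	return (0);
}

At runtime the threshold comes from the new loader tunable; for example, vfs.zfs.mg_alloc_failures="10" in /boot/loader.conf (an example value, not a recommendation).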
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c | 111
1 file changed, 84 insertions(+), 27 deletions(-)
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 17b4b12..a0620b8 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -20,6 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 by Delphix. All rights reserved.
*/
#include <sys/zfs_context.h>
@@ -30,10 +31,35 @@
#include <sys/vdev_impl.h>
#include <sys/zio.h>
+/*
+ * Allow allocations to switch to gang blocks quickly. We do this to
+ * avoid having to load lots of space_maps in a given txg. There are,
+ * however, some cases where we want to avoid "fast" ganging and instead
+ * we want to do an exhaustive search of all metaslabs on this device.
+ * Currently we don't allow any gang or dump device related allocations
+ * to "fast" gang.
+ */
+#define CAN_FASTGANG(flags) \
+ (!((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER | \
+ METASLAB_GANG_AVOID)))
+
uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * This value defines the number of allowed allocation failures per vdev.
+ * If a device reaches this threshold in a given txg then we consider skipping
+ * allocations on that device.
+ */
+int zfs_mg_alloc_failures = 0;
+
+SYSCTL_DECL(_vfs_zfs);
+SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_alloc_failures, CTLFLAG_RDTUN,
+ &zfs_mg_alloc_failures, 0,
+ "Number of allowed allocation failures per vdev");
+TUNABLE_INT("vfs.zfs.mg_alloc_failures", &zfs_mg_alloc_failures);
+
+/*
* Metaslab debugging: when set, keeps all space maps in core to verify frees.
*/
static int metaslab_debug = 0;
@@ -671,7 +697,7 @@ static space_map_ops_t metaslab_ndf_ops = {
metaslab_ndf_fragmented
};
-space_map_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops;
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
/*
* ==========================================================================
@@ -844,7 +870,7 @@ metaslab_prefetch(metaslab_group_t *mg)
}
static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
{
metaslab_group_t *mg = msp->ms_group;
space_map_t *sm = &msp->ms_map;
@@ -877,13 +903,6 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
mutex_exit(&mg->mg_lock);
}
- /*
- * If we were able to load the map then make sure
- * that this map is still able to satisfy our request.
- */
- if (msp->ms_weight < size)
- return (ENOSPC);
-
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
@@ -1099,6 +1118,7 @@ void
metaslab_sync_reassess(metaslab_group_t *mg)
{
vdev_t *vd = mg->mg_vd;
+ int64_t failures = mg->mg_alloc_failures;
/*
* Re-evaluate all metaslabs which have lower offsets than the
@@ -1115,6 +1135,8 @@ metaslab_sync_reassess(metaslab_group_t *mg)
mutex_exit(&msp->ms_lock);
}
+ atomic_add_64(&mg->mg_alloc_failures, -failures);
+
/*
* Prefetch the next potential metaslabs
*/
@@ -1139,9 +1161,10 @@ metaslab_distance(metaslab_t *msp, dva_t *dva)
}
static uint64_t
-metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
- uint64_t min_distance, dva_t *dva, int d)
+metaslab_group_alloc(metaslab_group_t *mg, uint64_t psize, uint64_t asize,
+ uint64_t txg, uint64_t min_distance, dva_t *dva, int d, int flags)
{
+ spa_t *spa = mg->mg_vd->vdev_spa;
metaslab_t *msp = NULL;
uint64_t offset = -1ULL;
avl_tree_t *t = &mg->mg_metaslab_tree;
@@ -1162,11 +1185,17 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
mutex_enter(&mg->mg_lock);
for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
- if (msp->ms_weight < size) {
+ if (msp->ms_weight < asize) {
+ spa_dbgmsg(spa, "%s: failed to meet weight "
+ "requirement: vdev %llu, txg %llu, mg %p, "
+ "msp %p, psize %llu, asize %llu, "
+ "failures %llu, weight %llu",
+ spa_name(spa), mg->mg_vd->vdev_id, txg,
+ mg, msp, psize, asize,
+ mg->mg_alloc_failures, msp->ms_weight);
mutex_exit(&mg->mg_lock);
return (-1ULL);
}
-
was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
@@ -1185,6 +1214,25 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if (msp == NULL)
return (-1ULL);
+ /*
+ * If we've already reached the allowable number of failed
+ * allocation attempts on this metaslab group then we
+ * consider skipping it. We skip it only if we're allowed
+ * to "fast" gang, the physical size is larger than
+ * a gang block, and we're attempting to allocate from
+ * the primary metaslab.
+ */
+ if (mg->mg_alloc_failures > zfs_mg_alloc_failures &&
+ CAN_FASTGANG(flags) && psize > SPA_GANGBLOCKSIZE &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY) {
+ spa_dbgmsg(spa, "%s: skipping metaslab group: "
+ "vdev %llu, txg %llu, mg %p, psize %llu, "
+ "asize %llu, failures %llu", spa_name(spa),
+ mg->mg_vd->vdev_id, txg, mg, psize, asize,
+ mg->mg_alloc_failures);
+ return (-1ULL);
+ }
+
mutex_enter(&msp->ms_lock);
/*
@@ -1193,7 +1241,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
- if (msp->ms_weight < size || (was_active &&
+ if (msp->ms_weight < asize || (was_active &&
!(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
activation_weight == METASLAB_WEIGHT_PRIMARY)) {
mutex_exit(&msp->ms_lock);
@@ -1208,14 +1256,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
continue;
}
- if (metaslab_activate(msp, activation_weight, size) != 0) {
+ if (metaslab_activate(msp, activation_weight) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
- if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL)
+ if ((offset = space_map_alloc(&msp->ms_map, asize)) != -1ULL)
break;
+ atomic_inc_64(&mg->mg_alloc_failures);
+
metaslab_passivate(msp, space_map_maxsize(&msp->ms_map));
mutex_exit(&msp->ms_lock);
@@ -1224,7 +1274,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg);
- space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
+ space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, asize);
mutex_exit(&msp->ms_lock);
@@ -1351,7 +1401,8 @@ top:
asize = vdev_psize_to_asize(vd, psize);
ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0);
- offset = metaslab_group_alloc(mg, asize, txg, distance, dva, d);
+ offset = metaslab_group_alloc(mg, psize, asize, txg, distance,
+ dva, d, flags);
if (offset != -1ULL) {
/*
* If we've just selected this metaslab group,
@@ -1363,18 +1414,24 @@ top:
vdev_stat_t *vs = &vd->vdev_stat;
int64_t vu, cu;
- /*
- * Determine percent used in units of 0..1024.
- * (This is just to avoid floating point.)
- */
- vu = (vs->vs_alloc << 10) / (vs->vs_space + 1);
- cu = (mc->mc_alloc << 10) / (mc->mc_space + 1);
+ vu = (vs->vs_alloc * 100) / (vs->vs_space + 1);
+ cu = (mc->mc_alloc * 100) / (mc->mc_space + 1);
/*
- * Bias by at most +/- 25% of the aliquot.
+ * Calculate how much more or less we should
+ * try to allocate from this device during
+ * this iteration around the rotor.
+ * For example, if a device is 80% full
+ * and the pool is 20% full then we should
+ * reduce allocations by 60% on this device.
+ *
+ * mg_bias = (20 - 80) * 512K / 100 = -307K
+ *
+ * This reduces allocations by 307K for this
+ * iteration.
*/
mg->mg_bias = ((cu - vu) *
- (int64_t)mg->mg_aliquot) / (1024 * 4);
+ (int64_t)mg->mg_aliquot) / 100;
}
if (atomic_add_64_nv(&mc->mc_aliquot, asize) >=
@@ -1488,7 +1545,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded)
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
if (error == 0 && !space_map_contains(&msp->ms_map, offset, size))
error = ENOENT;
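
[Editor's sketch] As a standalone sanity check of the worked example in the new mg_bias comment above (this is just the arithmetic, not ZFS code):

/* Device 80% full, pool 20% full, default metaslab_aliquot of 512K. */
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	int64_t vu = 80;                 /* vdev percent used */
	int64_t cu = 20;                 /* class (pool) percent used */
	int64_t aliquot = 512LL << 10;   /* metaslab_aliquot: 512K */
	int64_t bias = ((cu - vu) * aliquot) / 100;

	/* Prints -314572 bytes, i.e. roughly the -307K in the comment. */
	printf("mg_bias = %jd bytes (%jdK)\n",
	    (intmax_t)bias, (intmax_t)(bias / 1024));
	return (0);
}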