path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
author    delphij <delphij@FreeBSD.org>  2014-08-02 03:56:06 +0000
committer delphij <delphij@FreeBSD.org>  2014-08-02 03:56:06 +0000
commit    6a949e106dc93d9fa177f78a454fab1d92b5c5e7 (patch)
tree      7404e6333ff3ecd4049cbff81119e02d6331e597  /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
parent    4b7aa9ea8a7bce8b0826c0fb5337074a32500239 (diff)
MFC r268855: MFV r268848:

Instead of asserting that all zios are properly aligned, only assert on the
logical ones.

Cap uberblocks at 8k; otherwise, with ashift=17, there would be only one
uberblock. This fixes a problem where zdb would trip an assert on pools with
ashift >= 0xe (16k).

While there, also change the code so that it does not condense a space map
unless the uncondensed size consumes more than
zfs_metaslab_condense_block_threshold blocks.

Illumos issue:
	4958 zdb trips assert on pools with ashift >= 0xe
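The core of the new check can be illustrated with a small standalone sketch
(this is not the committed code; condense_saves_blocks() and its arguments are
hypothetical names used only for illustration, and the real
metaslab_should_condense() in the diff below additionally requires the space
map to exceed its minimal and zfs_condense_pct-scaled forms):

#include <stdint.h>

/* Default taken from the new tunable added by this change. */
static int zfs_metaslab_condense_block_threshold = 4;

/*
 * Hypothetical helper sketching the block-threshold heuristic: a space
 * map grows on disk in units of the larger of its data block size and
 * the vdev's minimum allocation (1 << ashift), so condensing can only
 * save space once the object already spans more than the threshold
 * number of such blocks.
 */
static int
condense_saves_blocks(uint64_t object_size, uint64_t sm_blocksize,
    uint64_t vdev_ashift)
{
	uint64_t vdev_blocksize = (uint64_t)1 << vdev_ashift;
	uint64_t record_size = sm_blocksize > vdev_blocksize ?
	    sm_blocksize : vdev_blocksize;

	return (object_size >
	    (uint64_t)zfs_metaslab_condense_block_threshold * record_size);
}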
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c')
-rw-r--r--  sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c  39
1 file changed, 35 insertions, 4 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index d834e83..d6e3ce0 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -74,6 +74,21 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN,
" of in-memory counterpart");
/*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, SPACE_MAP_INITIAL_BLOCKSIZE), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
+
+/*
* The zfs_mg_noalloc_threshold defines which metaslab groups should
* be eligible for allocation. The value is defined as a percentage of
* a free space. Metaslab groups that have more free space than
@@ -1371,6 +1386,8 @@ metaslab_group_preload(metaslab_group_t *mg)
* times the size of the free space range tree representation
* (i.e. zfs_condense_pct = 110 and in-core = 1MB, minimal = 1.1MB).
*
+ * 3. The on-disk size of the space map should actually decrease.
+ *
* Checking the first condition is tricky since we don't want to walk
* the entire AVL tree calculating the estimated on-disk size. Instead we
* use the size-ordered range tree in the metaslab and calculate the
@@ -1381,13 +1398,21 @@ metaslab_group_preload(metaslab_group_t *mg)
* To determine the second criterion we use a best-case estimate and assume
* each segment can be represented on-disk as a single 64-bit entry. We refer
* to this best-case estimate as the space map's minimal form.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
*/
static boolean_t
metaslab_should_condense(metaslab_t *msp)
{
space_map_t *sm = msp->ms_sm;
range_seg_t *rs;
- uint64_t size, entries, segsz;
+ uint64_t size, entries, segsz, object_size, optimal_size, record_size;
+ dmu_object_info_t doi;
+ uint64_t vdev_blocksize = 1 << msp->ms_group->mg_vd->vdev_ashift;
ASSERT(MUTEX_HELD(&msp->ms_lock));
ASSERT(msp->ms_loaded);
@@ -1411,9 +1436,15 @@ metaslab_should_condense(metaslab_t *msp)
entries = size / (MIN(size, SM_RUN_MAX));
segsz = entries * sizeof (uint64_t);
- return (segsz <= space_map_length(msp->ms_sm) &&
- space_map_length(msp->ms_sm) >= (zfs_condense_pct *
- sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root)) / 100);
+ optimal_size = sizeof (uint64_t) * avl_numnodes(&msp->ms_tree->rt_root);
+ object_size = space_map_length(msp->ms_sm);
+
+ dmu_object_info_from_db(sm->sm_dbuf, &doi);
+ record_size = MAX(doi.doi_data_block_size, vdev_blocksize);
+
+ return (segsz <= object_size &&
+ object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
}
/*
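As a rough worked example of the new third criterion (assuming the default
threshold of 4 and a space map data block size no larger than the vdev block
size): on a vdev with ashift=17, record_size is 128k, so the space map object
must exceed 4 * 128k = 512k on disk before it becomes eligible for condensing,
while with ashift=12 the bar is only 4 * 4k = 16k.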