Diffstat (limited to 'sys'): 11 files changed, 608 insertions, 204 deletions
diff --git a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c index 51041a8..a400f82 100644 --- a/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c +++ b/sys/cddl/contrib/opensolaris/common/zfs/zpool_prop.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #include <sys/zio.h> @@ -87,6 +87,8 @@ zpool_prop_init(void) PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC"); zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ"); + zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG"); zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CAP"); zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 45e2e12..5f3adef 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -32,6 +32,7 @@ #include <sys/vdev_impl.h> #include <sys/zio.h> #include <sys/spa_impl.h> +#include <sys/zfeature.h> SYSCTL_DECL(_vfs_zfs); SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); @@ -89,7 +90,7 @@ int zfs_metaslab_condense_block_threshold = 4; /* * The zfs_mg_noalloc_threshold defines which metaslab groups should * be eligible for allocation. The value is defined as a percentage of - * a free space. Metaslab groups that have more free space than + * free space. Metaslab groups that have more free space than * zfs_mg_noalloc_threshold are always eligible for allocations. Once * a metaslab group's free space is less than or equal to the * zfs_mg_noalloc_threshold the allocator will avoid allocating to that @@ -106,6 +107,23 @@ SYSCTL_INT(_vfs_zfs, OID_AUTO, mg_noalloc_threshold, CTLFLAG_RWTUN, " to make it eligible for allocation"); /* + * Metaslab groups are considered eligible for allocations if their + * fragmenation metric (measured as a percentage) is less than or equal to + * zfs_mg_fragmentation_threshold. If a metaslab group exceeds this threshold + * then it will be skipped unless all metaslab groups within the metaslab + * class have also crossed this threshold. + */ +int zfs_mg_fragmentation_threshold = 85; + +/* + * Allow metaslabs to keep their active state as long as their fragmentation + * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An + * active metaslab that exceeds this threshold will no longer keep its active + * status allowing better metaslabs to be selected. + */ +int zfs_metaslab_fragmentation_threshold = 70; + +/* * When set will load all metaslabs when pool is first opened. */ int metaslab_debug_load = 0; @@ -173,13 +191,6 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, unload_delay, CTLFLAG_RWTUN, "Number of TXGs that an unused metaslab can be kept in memory"); /* - * Should we be willing to write data to degraded vdevs? - */ -boolean_t zfs_write_to_degraded = B_FALSE; -SYSCTL_INT(_vfs_zfs, OID_AUTO, write_to_degraded, CTLFLAG_RWTUN, - &zfs_write_to_degraded, 0, "Allow writing data to degraded vdevs"); - -/* * Max number of metaslabs per group to preload. 
*/ int metaslab_preload_limit = SPA_DVAS_PER_BP; @@ -196,13 +207,30 @@ SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_enabled, CTLFLAG_RWTUN, "Max number of metaslabs per group to preload"); /* - * Enable/disable additional weight factor for each metaslab. + * Enable/disable fragmentation weighting on metaslabs. + */ +boolean_t metaslab_fragmentation_factor_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, fragmentation_factor_enabled, CTLFLAG_RWTUN, + &metaslab_fragmentation_factor_enabled, 0, + "Enable fragmentation weighting on metaslabs"); + +/* + * Enable/disable lba weighting (i.e. outer tracks are given preference). + */ +boolean_t metaslab_lba_weighting_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, lba_weighting_enabled, CTLFLAG_RWTUN, + &metaslab_lba_weighting_enabled, 0, + "Enable LBA weighting (i.e. outer tracks are given preference)"); + +/* + * Enable/disable metaslab group biasing. */ -boolean_t metaslab_weight_factor_enable = B_FALSE; -SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, weight_factor_enable, CTLFLAG_RWTUN, - &metaslab_weight_factor_enable, 0, - "Enable additional weight factor for each metaslab"); +boolean_t metaslab_bias_enabled = B_TRUE; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, bias_enabled, CTLFLAG_RWTUN, + &metaslab_bias_enabled, 0, + "Enable metaslab group biasing"); +static uint64_t metaslab_fragmentation(metaslab_t *); /* * ========================================================================== @@ -322,6 +350,121 @@ metaslab_class_get_minblocksize(metaslab_class_t *mc) return (mc->mc_minblocksize); } +void +metaslab_class_histogram_verify(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t *mc_hist; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + /* + * Skip any holes, uninitialized top-levels, or + * vdevs that are not in this metalab class. + */ + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + mc_hist[i] += mg->mg_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]); + + kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); +} + +/* + * Calculate the metaslab class's fragmentation metric. The metric + * is weighted based on the space contribution of each metaslab group. + * The return value will be a number between 0 and 100 (inclusive), or + * ZFS_FRAG_INVALID if the metric has not been set. See comment above the + * zfs_frag_table for more information about the metric. + */ +uint64_t +metaslab_class_fragmentation(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t fragmentation = 0; + + spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + /* + * Skip any holes, uninitialized top-levels, or + * vdevs that are not in this metalab class. + */ + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + /* + * If a metaslab group does not contain a fragmentation + * metric then just bail out. 
+ */ + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (ZFS_FRAG_INVALID); + } + + /* + * Determine how much this metaslab_group is contributing + * to the overall pool fragmentation metric. + */ + fragmentation += mg->mg_fragmentation * + metaslab_group_get_space(mg); + } + fragmentation /= metaslab_class_get_space(mc); + + ASSERT3U(fragmentation, <=, 100); + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (fragmentation); +} + +/* + * Calculate the amount of expandable space that is available in + * this metaslab class. If a device is expanded then its expandable + * space will be the amount of allocatable space that is currently not + * part of this metaslab class. + */ +uint64_t +metaslab_class_expandable_space(metaslab_class_t *mc) +{ + vdev_t *rvd = mc->mc_spa->spa_root_vdev; + uint64_t space = 0; + + spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_ishole || tvd->vdev_ms_shift == 0 || + mg->mg_class != mc) { + continue; + } + + space += tvd->vdev_max_asize - tvd->vdev_asize; + } + spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG); + return (space); +} + /* * ========================================================================== * Metaslab groups @@ -374,7 +517,15 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / (vs->vs_space + 1); - mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold); + /* + * A metaslab group is considered allocatable if it has plenty + * of free space or is not heavily fragmented. We only take + * fragmentation into account if the metaslab group has a valid + * fragmentation metric (i.e. a value between 0 and 100). 
+ */ + mg->mg_allocatable = (mg->mg_free_capacity > zfs_mg_noalloc_threshold && + (mg->mg_fragmentation == ZFS_FRAG_INVALID || + mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); /* * The mc_alloc_groups maintains a count of the number of @@ -395,6 +546,7 @@ metaslab_group_alloc_update(metaslab_group_t *mg) mc->mc_alloc_groups--; else if (!was_allocatable && mg->mg_allocatable) mc->mc_alloc_groups++; + mutex_exit(&mg->mg_lock); } @@ -485,6 +637,7 @@ metaslab_group_passivate(metaslab_group_t *mg) } taskq_wait(mg->mg_taskq); + metaslab_group_alloc_update(mg); mgprev = mg->mg_prev; mgnext = mg->mg_next; @@ -502,20 +655,113 @@ metaslab_group_passivate(metaslab_group_t *mg) metaslab_class_minblocksize_update(mc); } +uint64_t +metaslab_group_get_space(metaslab_group_t *mg) +{ + return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); +} + +void +metaslab_group_histogram_verify(metaslab_group_t *mg) +{ + uint64_t *mg_hist; + vdev_t *vd = mg->mg_vd; + uint64_t ashift = vd->vdev_ashift; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, + SPACE_MAP_HISTOGRAM_SIZE + ashift); + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_sm == NULL) + continue; + + for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + mg_hist[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); + + kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); +} + static void -metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +metaslab_group_histogram_add(metaslab_group_t *mg, metaslab_t *msp) +{ + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + mg->mg_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +void +metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) { + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + ASSERT3U(mg->mg_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + ASSERT3U(mc->mc_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + + mg->mg_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +{ ASSERT(msp->ms_group == NULL); + mutex_enter(&mg->mg_lock); msp->ms_group = mg; msp->ms_weight = 0; avl_add(&mg->mg_metaslab_tree, msp); mutex_exit(&mg->mg_lock); + + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_add(mg, msp); + mutex_exit(&msp->ms_lock); } static void metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) { + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_remove(mg, msp); + mutex_exit(&msp->ms_lock); + mutex_enter(&mg->mg_lock); ASSERT(msp->ms_group == mg); avl_remove(&mg->mg_metaslab_tree, 
msp); @@ -528,9 +774,9 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) { /* * Although in principle the weight can be any value, in - * practice we do not use values in the range [1, 510]. + * practice we do not use values in the range [1, 511]. */ - ASSERT(weight >= SPA_MINBLOCKSIZE-1 || weight == 0); + ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); ASSERT(MUTEX_HELD(&msp->ms_lock)); mutex_enter(&mg->mg_lock); @@ -542,9 +788,42 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) } /* + * Calculate the fragmentation for a given metaslab group. We can use + * a simple average here since all metaslabs within the group must have + * the same size. The return value will be a value between 0 and 100 + * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this + * group have a fragmentation metric. + */ +uint64_t +metaslab_group_fragmentation(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + uint64_t fragmentation = 0; + uint64_t valid_ms = 0; + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_fragmentation == ZFS_FRAG_INVALID) + continue; + + valid_ms++; + fragmentation += msp->ms_fragmentation; + } + + if (valid_ms <= vd->vdev_ms_count / 2) + return (ZFS_FRAG_INVALID); + + fragmentation /= valid_ms; + ASSERT3U(fragmentation, <=, 100); + return (fragmentation); +} + +/* * Determine if a given metaslab group should skip allocations. A metaslab - * group should avoid allocations if its used capacity has crossed the - * zfs_mg_noalloc_threshold and there is at least one metaslab group + * group should avoid allocations if its free capacity is less than the + * zfs_mg_noalloc_threshold or its fragmentation metric is greater than + * zfs_mg_fragmentation_threshold and there is at least one metaslab group * that can still handle allocations. */ static boolean_t @@ -555,12 +834,19 @@ metaslab_group_allocatable(metaslab_group_t *mg) metaslab_class_t *mc = mg->mg_class; /* - * A metaslab group is considered allocatable if its free capacity - * is greater than the set value of zfs_mg_noalloc_threshold, it's - * associated with a slog, or there are no other metaslab groups - * with free capacity greater than zfs_mg_noalloc_threshold. + * We use two key metrics to determine if a metaslab group is + * considered allocatable -- free space and fragmentation. If + * the free space is greater than the free space threshold and + * the fragmentation is less than the fragmentation threshold then + * consider the group allocatable. There are two case when we will + * not consider these key metrics. The first is if the group is + * associated with a slog device and the second is if all groups + * in this metaslab class have already been consider ineligible + * for allocations. 
*/ - return (mg->mg_free_capacity > zfs_mg_noalloc_threshold || + return ((mg->mg_free_capacity > zfs_mg_noalloc_threshold && + (mg->mg_fragmentation == ZFS_FRAG_INVALID || + mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)) || mc != spa_normal_class(spa) || mc->mc_alloc_groups == 0); } @@ -784,16 +1070,8 @@ metaslab_ff_alloc(metaslab_t *msp, uint64_t size) return (metaslab_block_picker(t, cursor, size, align)); } -/* ARGSUSED */ -static boolean_t -metaslab_ff_fragmented(metaslab_t *msp) -{ - return (B_TRUE); -} - static metaslab_ops_t metaslab_ff_ops = { - metaslab_ff_alloc, - metaslab_ff_fragmented + metaslab_ff_alloc }; /* @@ -840,23 +1118,8 @@ metaslab_df_alloc(metaslab_t *msp, uint64_t size) return (metaslab_block_picker(t, cursor, size, 1ULL)); } -static boolean_t -metaslab_df_fragmented(metaslab_t *msp) -{ - range_tree_t *rt = msp->ms_tree; - uint64_t max_size = metaslab_block_maxsize(msp); - int free_pct = range_tree_space(rt) * 100 / msp->ms_size; - - if (max_size >= metaslab_df_alloc_threshold && - free_pct >= metaslab_df_free_pct) - return (B_FALSE); - - return (B_TRUE); -} - static metaslab_ops_t metaslab_df_ops = { - metaslab_df_alloc, - metaslab_df_fragmented + metaslab_df_alloc }; /* @@ -899,15 +1162,8 @@ metaslab_cf_alloc(metaslab_t *msp, uint64_t size) return (offset); } -static boolean_t -metaslab_cf_fragmented(metaslab_t *msp) -{ - return (metaslab_block_maxsize(msp) < metaslab_min_alloc_size); -} - static metaslab_ops_t metaslab_cf_ops = { - metaslab_cf_alloc, - metaslab_cf_fragmented + metaslab_cf_alloc }; /* @@ -964,16 +1220,8 @@ metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) return (-1ULL); } -static boolean_t -metaslab_ndf_fragmented(metaslab_t *msp) -{ - return (metaslab_block_maxsize(msp) <= - (metaslab_min_alloc_size << metaslab_ndf_clump_shift)); -} - static metaslab_ops_t metaslab_ndf_ops = { - metaslab_ndf_alloc, - metaslab_ndf_fragmented + metaslab_ndf_alloc }; metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops; @@ -1075,6 +1323,7 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg) msp->ms_tree = range_tree_create(&metaslab_rt_ops, msp, &msp->ms_lock); metaslab_group_add(mg, msp); + msp->ms_fragmentation = metaslab_fragmentation(msp); msp->ms_ops = mg->mg_class->mc_ops; /* @@ -1140,69 +1389,113 @@ metaslab_fini(metaslab_t *msp) kmem_free(msp, sizeof (metaslab_t)); } +#define FRAGMENTATION_TABLE_SIZE 17 + /* - * Apply a weighting factor based on the histogram information for this - * metaslab. The current weighting factor is somewhat arbitrary and requires - * additional investigation. The implementation provides a measure of - * "weighted" free space and gives a higher weighting for larger contiguous - * regions. The weighting factor is determined by counting the number of - * sm_shift sectors that exist in each region represented by the histogram. - * That value is then multiplied by the power of 2 exponent and the sm_shift - * value. + * This table defines a segment size based fragmentation metric that will + * allow each metaslab to derive its own fragmentation value. This is done + * by calculating the space in each bucket of the spacemap histogram and + * multiplying that by the fragmetation metric in this table. Doing + * this for all buckets and dividing it by the total amount of free + * space in this metaslab (i.e. the total free space in all buckets) gives + * us the fragmentation metric. This means that a high fragmentation metric + * equates to most of the free space being comprised of small segments. 
+ * Conversely, if the metric is low, then most of the free space is in + * large segments. A 10% change in fragmentation equates to approximately + * double the number of segments. * - * For example, assume the 2^21 histogram bucket has 4 2MB regions and the - * metaslab has an sm_shift value of 9 (512B): - * - * 1) calculate the number of sm_shift sectors in the region: - * 2^21 / 2^9 = 2^12 = 4096 * 4 (number of regions) = 16384 - * 2) multiply by the power of 2 exponent and the sm_shift value: - * 16384 * 21 * 9 = 3096576 - * This value will be added to the weighting of the metaslab. + * This table defines 0% fragmented space using 16MB segments. Testing has + * shown that segments that are greater than or equal to 16MB do not suffer + * from drastic performance problems. Using this value, we derive the rest + * of the table. Since the fragmentation value is never stored on disk, it + * is possible to change these calculations in the future. + */ +int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = { + 100, /* 512B */ + 100, /* 1K */ + 98, /* 2K */ + 95, /* 4K */ + 90, /* 8K */ + 80, /* 16K */ + 70, /* 32K */ + 60, /* 64K */ + 50, /* 128K */ + 40, /* 256K */ + 30, /* 512K */ + 20, /* 1M */ + 15, /* 2M */ + 10, /* 4M */ + 5, /* 8M */ + 0 /* 16M */ +}; + +/* + * Calclate the metaslab's fragmentation metric. A return value + * of ZFS_FRAG_INVALID means that the metaslab has not been upgraded and does + * not support this metric. Otherwise, the return value should be in the + * range [0, 100]. */ static uint64_t -metaslab_weight_factor(metaslab_t *msp) +metaslab_fragmentation(metaslab_t *msp) { - uint64_t factor = 0; - uint64_t sectors; - int i; + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t fragmentation = 0; + uint64_t total = 0; + boolean_t feature_enabled = spa_feature_is_enabled(spa, + SPA_FEATURE_SPACEMAP_HISTOGRAM); + + if (!feature_enabled) + return (ZFS_FRAG_INVALID); /* - * A null space map means that the entire metaslab is free, - * calculate a weight factor that spans the entire size of the - * metaslab. + * A null space map means that the entire metaslab is free + * and thus is not fragmented. */ - if (msp->ms_sm == NULL) { + if (msp->ms_sm == NULL) + return (0); + + /* + * If this metaslab's space_map has not been upgraded, flag it + * so that we upgrade next time we encounter it. + */ + if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) { + uint64_t txg = spa_syncing_txg(spa); vdev_t *vd = msp->ms_group->mg_vd; - i = highbit64(msp->ms_size) - 1; - sectors = msp->ms_size >> vd->vdev_ashift; - return (sectors * i * vd->vdev_ashift); + msp->ms_condense_wanted = B_TRUE; + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + spa_dbgmsg(spa, "txg %llu, requesting force condense: " + "msp %p, vd %p", txg, msp, vd); + return (ZFS_FRAG_INVALID); } - if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) - return (0); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + uint64_t space = 0; + uint8_t shift = msp->ms_sm->sm_shift; + int idx = MIN(shift - SPA_MINBLOCKSHIFT + i, + FRAGMENTATION_TABLE_SIZE - 1); - for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE(msp->ms_sm); i++) { if (msp->ms_sm->sm_phys->smp_histogram[i] == 0) continue; - /* - * Determine the number of sm_shift sectors in the region - * indicated by the histogram. For example, given an - * sm_shift value of 9 (512 bytes) and i = 4 then we know - * that we're looking at an 8K region in the histogram - * (i.e. 9 + 4 = 13, 2^13 = 8192). 
To figure out the - * number of sm_shift sectors (512 bytes in this example), - * we would take 8192 / 512 = 16. Since the histogram - * is offset by sm_shift we can simply use the value of - * of i to calculate this (i.e. 2^i = 16 where i = 4). - */ - sectors = msp->ms_sm->sm_phys->smp_histogram[i] << i; - factor += (i + msp->ms_sm->sm_shift) * sectors; + space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift); + total += space; + + ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE); + fragmentation += space * zfs_frag_table[idx]; } - return (factor * msp->ms_sm->sm_shift); + + if (total > 0) + fragmentation /= total; + ASSERT3U(fragmentation, <=, 100); + return (fragmentation); } +/* + * Compute a weight -- a selection preference value -- for the given metaslab. + * This is based on the amount of free space, the level of fragmentation, + * the LBA range, and whether the metaslab is loaded. + */ static uint64_t metaslab_weight(metaslab_t *msp) { @@ -1226,6 +1519,29 @@ metaslab_weight(metaslab_t *msp) * The baseline weight is the metaslab's free space. */ space = msp->ms_size - space_map_allocated(msp->ms_sm); + + msp->ms_fragmentation = metaslab_fragmentation(msp); + if (metaslab_fragmentation_factor_enabled && + msp->ms_fragmentation != ZFS_FRAG_INVALID) { + /* + * Use the fragmentation information to inversely scale + * down the baseline weight. We need to ensure that we + * don't exclude this metaslab completely when it's 100% + * fragmented. To avoid this we reduce the fragmented value + * by 1. + */ + space = (space * (100 - (msp->ms_fragmentation - 1))) / 100; + + /* + * If space < SPA_MINBLOCKSIZE, then we will not allocate from + * this metaslab again. The fragmentation metric may have + * decreased the space to something smaller than + * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE + * so that we can consume any remaining space. + */ + if (space > 0 && space < SPA_MINBLOCKSIZE) + space = SPA_MINBLOCKSIZE; + } weight = space; /* @@ -1237,19 +1553,19 @@ metaslab_weight(metaslab_t *msp) * In effect, this means that we'll select the metaslab with the most * free bandwidth rather than simply the one with the most free space. */ - weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; - ASSERT(weight >= space && weight <= 2 * space); - - msp->ms_factor = metaslab_weight_factor(msp); - if (metaslab_weight_factor_enable) - weight += msp->ms_factor; + if (metaslab_lba_weighting_enabled) { + weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count; + ASSERT(weight >= space && weight <= 2 * space); + } - if (msp->ms_loaded && !msp->ms_ops->msop_fragmented(msp)) { - /* - * If this metaslab is one we're actively using, adjust its - * weight to make it preferable to any inactive metaslab so - * we'll polish it off. - */ + /* + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. If the fragmentation on this metaslab + * has exceed our threshold, then don't mark it active. 
+ */ + if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID && + msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) { weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); } @@ -1334,9 +1650,16 @@ metaslab_group_preload(metaslab_group_t *mg) while (msp != NULL) { metaslab_t *msp_next = AVL_NEXT(t, msp); - /* If we have reached our preload limit then we're done */ - if (++m > metaslab_preload_limit) - break; + /* + * We preload only the maximum number of metaslabs specified + * by metaslab_preload_limit. If a metaslab is being forced + * to condense then we preload it too. This will ensure + * that force condensing happens in the next txg. + */ + if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) { + msp = msp_next; + continue; + } /* * We must drop the metaslab group lock here to preserve @@ -1404,11 +1727,12 @@ metaslab_should_condense(metaslab_t *msp) /* * Use the ms_size_tree range tree, which is ordered by size, to - * obtain the largest segment in the free tree. If the tree is empty - * then we should condense the map. + * obtain the largest segment in the free tree. We always condense + * metaslabs that are empty and metaslabs for which a condense + * request has been made. */ rs = avl_last(&msp->ms_size_tree); - if (rs == NULL) + if (rs == NULL || msp->ms_condense_wanted) return (B_TRUE); /* @@ -1449,9 +1773,14 @@ metaslab_condense(metaslab_t *msp, uint64_t txg, dmu_tx_t *tx) ASSERT3U(spa_sync_pass(spa), ==, 1); ASSERT(msp->ms_loaded); + spa_dbgmsg(spa, "condensing: txg %llu, msp[%llu] %p, " - "smp size %llu, segments %lu", txg, msp->ms_id, msp, - space_map_length(msp->ms_sm), avl_numnodes(&msp->ms_tree->rt_root)); + "smp size %llu, segments %lu, forcing condense=%s", txg, + msp->ms_id, msp, space_map_length(msp->ms_sm), + avl_numnodes(&msp->ms_tree->rt_root), + msp->ms_condense_wanted ? "TRUE" : "FALSE"); + + msp->ms_condense_wanted = B_FALSE; /* * Create an range tree that is 100% allocated. We remove segments @@ -1544,8 +1873,14 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) ASSERT3P(*freetree, !=, NULL); ASSERT3P(*freed_tree, !=, NULL); + /* + * Normally, we don't want to process a metaslab if there + * are no allocations or frees to perform. However, if the metaslab + * is being forced to condense we need to let it through. 
+ */ if (range_tree_space(alloctree) == 0 && - range_tree_space(*freetree) == 0) + range_tree_space(*freetree) == 0 && + !msp->ms_condense_wanted) return; /* @@ -1582,8 +1917,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_write(msp->ms_sm, *freetree, SM_FREE, tx); } - range_tree_vacate(alloctree, NULL, NULL); - + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); if (msp->ms_loaded) { /* * When the space map is loaded, we have an accruate @@ -1603,6 +1939,9 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) */ space_map_histogram_add(msp->ms_sm, *freetree, tx); } + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); /* * For sync pass 1, we avoid traversing this txg's free range tree @@ -1615,6 +1954,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) } else { range_tree_vacate(*freetree, range_tree_add, *freed_tree); } + range_tree_vacate(alloctree, NULL, NULL); ASSERT0(range_tree_space(msp->ms_alloctree[txg & TXG_MASK])); ASSERT0(range_tree_space(msp->ms_freetree[txg & TXG_MASK])); @@ -1725,13 +2065,13 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) metaslab_group_sort(mg, msp, metaslab_weight(msp)); mutex_exit(&msp->ms_lock); - } void metaslab_sync_reassess(metaslab_group_t *mg) { metaslab_group_alloc_update(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); /* * Preload the next potential metaslabs @@ -1993,9 +2333,7 @@ top: */ if ((vd->vdev_stat.vs_write_errors > 0 || vd->vdev_state < VDEV_STATE_HEALTHY) && - d == 0 && dshift == 3 && - !(zfs_write_to_degraded && vd->vdev_state == - VDEV_STATE_DEGRADED)) { + d == 0 && dshift == 3 && vd->vdev_children == 0) { all_zero = B_FALSE; goto next; } @@ -2020,7 +2358,7 @@ top: * over- or under-used relative to the pool, * and set an allocation bias to even it out. 
*/ - if (mc->mc_aliquot == 0) { + if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { vdev_stat_t *vs = &vd->vdev_stat; int64_t vu, cu; @@ -2042,6 +2380,8 @@ top: */ mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / 100; + } else if (!metaslab_bias_enabled) { + mg->mg_bias = 0; } if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c index 15e123e..22175e0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/range_tree.c @@ -81,6 +81,7 @@ range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; + ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); @@ -95,6 +96,7 @@ range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) uint64_t size = rs->rs_end - rs->rs_start; int idx = highbit64(size) - 1; + ASSERT(size != 0); ASSERT3U(idx, <, sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index 02b068e..a0e19ec 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -210,12 +210,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { vdev_t *rvd = spa->spa_root_vdev; dsl_pool_t *pool = spa->spa_dsl_pool; - uint64_t size; - uint64_t alloc; - uint64_t space; - uint64_t cap, version; + uint64_t size, alloc, cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; + metaslab_class_t *mc = spa_normal_class(spa); ASSERT(MUTEX_HELD(&spa->spa_props_lock)); @@ -228,14 +226,10 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, size - alloc, src); - space = 0; - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *tvd = rvd->vdev_child[c]; - space += tvd->vdev_max_asize - tvd->vdev_asize; - } - spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, space, - src); - + spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, + metaslab_class_fragmentation(mc), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, + metaslab_class_expandable_space(mc), src); spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, (spa_mode(spa) == FREAD), src); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c index 629870b..d158b24 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/space_map.c @@ -202,10 +202,10 @@ space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) * reached the maximum bucket size. Accumulate all ranges * larger than the max bucket size into the last bucket. 
*/ - if (idx < SPACE_MAP_HISTOGRAM_SIZE(sm) - 1) { + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { ASSERT3U(idx + sm->sm_shift, ==, i); idx++; - ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE(sm)); + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); } } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index fda9fff..deefb19 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2011, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_METASLAB_H @@ -38,23 +38,22 @@ extern "C" { typedef struct metaslab_ops { uint64_t (*msop_alloc)(metaslab_t *msp, uint64_t size); - boolean_t (*msop_fragmented)(metaslab_t *msp); } metaslab_ops_t; extern metaslab_ops_t *zfs_metaslab_ops; -metaslab_t *metaslab_init(metaslab_group_t *mg, uint64_t id, - uint64_t object, uint64_t txg); -void metaslab_fini(metaslab_t *msp); +metaslab_t *metaslab_init(metaslab_group_t *, uint64_t, + uint64_t, uint64_t); +void metaslab_fini(metaslab_t *); -void metaslab_load_wait(metaslab_t *msp); -int metaslab_load(metaslab_t *msp); -void metaslab_unload(metaslab_t *msp); +void metaslab_load_wait(metaslab_t *); +int metaslab_load(metaslab_t *); +void metaslab_unload(metaslab_t *); -void metaslab_sync(metaslab_t *msp, uint64_t txg); -void metaslab_sync_done(metaslab_t *msp, uint64_t txg); -void metaslab_sync_reassess(metaslab_group_t *mg); -uint64_t metaslab_block_maxsize(metaslab_t *msp); +void metaslab_sync(metaslab_t *, uint64_t); +void metaslab_sync_done(metaslab_t *, uint64_t); +void metaslab_sync_reassess(metaslab_group_t *); +uint64_t metaslab_block_maxsize(metaslab_t *); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 @@ -62,29 +61,35 @@ uint64_t metaslab_block_maxsize(metaslab_t *msp); #define METASLAB_GANG_CHILD 0x4 #define METASLAB_GANG_AVOID 0x8 -int metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, - blkptr_t *bp, int ncopies, uint64_t txg, blkptr_t *hintbp, int flags); -void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); -int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -void metaslab_check_free(spa_t *spa, const blkptr_t *bp); +int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t, + blkptr_t *, int, uint64_t, blkptr_t *, int); +void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t); +int metaslab_claim(spa_t *, const blkptr_t *, uint64_t); +void metaslab_check_free(spa_t *, const blkptr_t *); -metaslab_class_t *metaslab_class_create(spa_t *spa, metaslab_ops_t *ops); -void metaslab_class_destroy(metaslab_class_t *mc); -int metaslab_class_validate(metaslab_class_t *mc); +metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *); +void metaslab_class_destroy(metaslab_class_t *); +int metaslab_class_validate(metaslab_class_t *); +void metaslab_class_histogram_verify(metaslab_class_t *); +uint64_t metaslab_class_fragmentation(metaslab_class_t *); +uint64_t metaslab_class_expandable_space(metaslab_class_t *); -void metaslab_class_space_update(metaslab_class_t *mc, - int64_t alloc_delta, int64_t defer_delta, - int64_t space_delta, int64_t dspace_delta); -uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); -uint64_t metaslab_class_get_space(metaslab_class_t *mc); -uint64_t 
metaslab_class_get_dspace(metaslab_class_t *mc); -uint64_t metaslab_class_get_deferred(metaslab_class_t *mc); +void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t, + int64_t, int64_t); +uint64_t metaslab_class_get_alloc(metaslab_class_t *); +uint64_t metaslab_class_get_space(metaslab_class_t *); +uint64_t metaslab_class_get_dspace(metaslab_class_t *); +uint64_t metaslab_class_get_deferred(metaslab_class_t *); uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc); -metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); -void metaslab_group_destroy(metaslab_group_t *mg); -void metaslab_group_activate(metaslab_group_t *mg); -void metaslab_group_passivate(metaslab_group_t *mg); +metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *); +void metaslab_group_destroy(metaslab_group_t *); +void metaslab_group_activate(metaslab_group_t *); +void metaslab_group_passivate(metaslab_group_t *); +uint64_t metaslab_group_get_space(metaslab_group_t *); +void metaslab_group_histogram_verify(metaslab_group_t *); +uint64_t metaslab_group_fragmentation(metaslab_group_t *); +void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *); #ifdef __cplusplus } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 36d11d9..eb7c932 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -41,6 +41,23 @@ extern "C" { #endif +/* + * A metaslab class encompasses a category of allocatable top-level vdevs. + * Each top-level vdev is associated with a metaslab group which defines + * the allocatable region for that vdev. Examples of these categories include + * "normal" for data block allocations (i.e. main pool allocations) or "log" + * for allocations designated for intent log devices (i.e. slog devices). + * When a block allocation is requested from the SPA it is associated with a + * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging + * to the class can be used to satisfy that request. Allocations are done + * by traversing the metaslab groups that are linked off of the mc_rotor field. + * This rotor points to the next metaslab group where allocations will be + * attempted. Allocating a block is a 3 step process -- select the metaslab + * group, select the metaslab, and then allocate the block. The metaslab + * class defines the low-level block allocator that will be used as the + * final step in allocation. These allocators are pluggable allowing each class + * to use a block allocator that best suits that class. + */ struct metaslab_class { spa_t *mc_spa; metaslab_group_t *mc_rotor; @@ -52,8 +69,18 @@ struct metaslab_class { uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ uint64_t mc_minblocksize; + uint64_t mc_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; +/* + * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs) + * of a top-level vdev. They are linked togther to form a circular linked + * list and can belong to only one metaslab class. Metaslab groups may become + * ineligible for allocations for a number of reasons such as limited free + * space, fragmentation, or going offline. When this happens the allocator will + * simply find the next metaslab group in the linked list and attempt + * to allocate from that group instead. 
+ */ struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; @@ -67,12 +94,14 @@ struct metaslab_group { taskq_t *mg_taskq; metaslab_group_t *mg_prev; metaslab_group_t *mg_next; + uint64_t mg_fragmentation; + uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE]; }; /* * This value defines the number of elements in the ms_lbas array. The value - * of 64 was chosen as it covers to cover all power of 2 buckets up to - * UINT64_MAX. This is the equivalent of highbit(UINT64_MAX). + * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX. + * This is the equivalent of highbit(UINT64_MAX). */ #define MAX_LBAS 64 @@ -135,6 +164,7 @@ struct metaslab { uint64_t ms_id; uint64_t ms_start; uint64_t ms_size; + uint64_t ms_fragmentation; range_tree_t *ms_alloctree[TXG_SIZE]; range_tree_t *ms_freetree[TXG_SIZE]; @@ -142,12 +172,12 @@ struct metaslab { range_tree_t *ms_tree; boolean_t ms_condensing; /* condensing? */ + boolean_t ms_condense_wanted; boolean_t ms_loaded; boolean_t ms_loading; int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ - uint64_t ms_factor; uint64_t ms_access_txg; /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h index 3691803..67fa276 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/space_map.h @@ -24,7 +24,7 @@ */ /* - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. */ #ifndef _SYS_SPACE_MAP_H @@ -44,9 +44,7 @@ extern "C" { * maintain backward compatibility. */ #define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) -#define SPACE_MAP_HISTOGRAM_SIZE(sm) \ - (sizeof ((sm)->sm_phys->smp_histogram) / \ - sizeof ((sm)->sm_phys->smp_histogram[0])) +#define SPACE_MAP_HISTOGRAM_SIZE 32 /* * The space_map_phys is the on-disk representation of the space map. @@ -68,7 +66,7 @@ typedef struct space_map_phys { * whose size is: * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) */ - uint64_t smp_histogram[32]; /* histogram of free space */ + uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; } space_map_phys_t; /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h index 6ef5246..6dc521b 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zfs_debug.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
*/ #ifndef _SYS_ZFS_DEBUG_H @@ -52,13 +52,14 @@ extern int zfs_flags; extern boolean_t zfs_recover; extern boolean_t zfs_free_leak_on_eio; -#define ZFS_DEBUG_DPRINTF (1<<0) -#define ZFS_DEBUG_DBUF_VERIFY (1<<1) -#define ZFS_DEBUG_DNODE_VERIFY (1<<2) -#define ZFS_DEBUG_SNAPNAMES (1<<3) -#define ZFS_DEBUG_MODIFY (1<<4) -#define ZFS_DEBUG_SPA (1<<5) -#define ZFS_DEBUG_ZIO_FREE (1<<6) +#define ZFS_DEBUG_DPRINTF (1<<0) +#define ZFS_DEBUG_DBUF_VERIFY (1<<1) +#define ZFS_DEBUG_DNODE_VERIFY (1<<2) +#define ZFS_DEBUG_SNAPNAMES (1<<3) +#define ZFS_DEBUG_MODIFY (1<<4) +#define ZFS_DEBUG_SPA (1<<5) +#define ZFS_DEBUG_ZIO_FREE (1<<6) +#define ZFS_DEBUG_HISTOGRAM_VERIFY (1<<7) #ifdef ZFS_DEBUG extern void __dprintf(const char *file, const char *func, diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index 9c26d2c..885c432 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -2257,6 +2257,11 @@ vdev_remove(vdev_t *vd, uint64_t txg) tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (vd->vdev_ms != NULL) { + metaslab_group_t *mg = vd->vdev_mg; + + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + for (int m = 0; m < vd->vdev_ms_count; m++) { metaslab_t *msp = vd->vdev_ms[m]; @@ -2264,12 +2269,27 @@ vdev_remove(vdev_t *vd, uint64_t txg) continue; mutex_enter(&msp->ms_lock); + /* + * If the metaslab was not loaded when the vdev + * was removed then the histogram accounting may + * not be accurate. Update the histogram information + * here so that we ensure that the metaslab group + * and metaslab class are up-to-date. + */ + metaslab_group_histogram_remove(mg, msp); + VERIFY0(space_map_allocated(msp->ms_sm)); space_map_free(msp->ms_sm, tx); space_map_close(msp->ms_sm); msp->ms_sm = NULL; mutex_exit(&msp->ms_lock); } + + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + ASSERT0(mg->mg_histogram[i]); + } if (vd->vdev_ms_array) { @@ -2729,7 +2749,10 @@ vdev_accessible(vdev_t *vd, zio_t *zio) void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) { - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); mutex_enter(&vd->vdev_stat_lock); bcopy(&vd->vdev_stat, vs, sizeof (*vs)); @@ -2743,7 +2766,8 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) ? 
vd->vdev_top->vdev_ashift : vd->vdev_ashift; vs->vs_logical_ashift = vd->vdev_logical_ashift; vs->vs_physical_ashift = vd->vdev_physical_ashift; - mutex_exit(&vd->vdev_stat_lock); + if (vd->vdev_aux == NULL && vd == vd->vdev_top) + vs->vs_fragmentation = vd->vdev_mg->mg_fragmentation; /* * If we're getting stats on the root vdev, aggregate the I/O counts @@ -2754,15 +2778,14 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vdev_t *cvd = rvd->vdev_child[c]; vdev_stat_t *cvs = &cvd->vdev_stat; - mutex_enter(&vd->vdev_stat_lock); for (int t = 0; t < ZIO_TYPES; t++) { vs->vs_ops[t] += cvs->vs_ops[t]; vs->vs_bytes[t] += cvs->vs_bytes[t]; } cvs->vs_scan_removing = cvd->vdev_removing; - mutex_exit(&vd->vdev_stat_lock); } } + mutex_exit(&vd->vdev_stat_lock); } void diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index b2288fb..87d0650 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -194,6 +194,7 @@ typedef enum { ZPOOL_PROP_COMMENT, ZPOOL_PROP_EXPANDSZ, ZPOOL_PROP_FREEING, + ZPOOL_PROP_FRAGMENTATION, ZPOOL_PROP_LEAKED, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -599,6 +600,13 @@ typedef struct zpool_rewind_policy { #define SPA_MINDEVSIZE (64ULL << 20) /* + * Set if the fragmentation has not yet been calculated. This can happen + * because the space maps have not been upgraded or the histogram feature + * is not enabled. + */ +#define ZFS_FRAG_INVALID UINT64_MAX + +/* * The location of the pool configuration repository, shared between kernel and * userland. */ @@ -739,6 +747,7 @@ typedef struct vdev_stat { uint64_t vs_configured_ashift; /* TLV vdev_ashift */ uint64_t vs_logical_ashift; /* vdev_logical_ashift */ uint64_t vs_physical_ashift; /* vdev_physical_ashift */ + uint64_t vs_fragmentation; /* device fragmentation */ } vdev_stat_t; #define VDEV_STAT_VALID(field, uint64_t_field_count) \ ((uint64_t_field_count * sizeof(uint64_t)) >= \ |
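
The comment above zfs_frag_table describes the new segment-size based metric: weight the free space in each space-map histogram bucket by a per-size fragmentation percentage, then divide by the total free space. Below is a minimal user-space C sketch of that arithmetic only, not the kernel function itself; the histogram contents in main() and the helper name fragmentation_from_histogram() are invented for illustration, while the table values and the index clamping mirror what the patch adds to metaslab_fragmentation() in metaslab.c.

#include <stdint.h>
#include <stdio.h>

#define FRAG_TABLE_SIZE 17      /* mirrors FRAGMENTATION_TABLE_SIZE */
#define HISTOGRAM_SIZE  32      /* mirrors SPACE_MAP_HISTOGRAM_SIZE */
#define MINBLOCKSHIFT   9       /* SPA_MINBLOCKSHIFT: 512-byte minimum block */

/*
 * Same values as zfs_frag_table in the patch: the fragmentation percentage
 * charged to free space sitting in segments of a given size.  Slots not
 * listed default to 0, so segments of 16M and larger add no fragmentation.
 */
static const int frag_table[FRAG_TABLE_SIZE] = {
    100,    /* 512B */
    100,    /* 1K */
    98,     /* 2K */
    95,     /* 4K */
    90,     /* 8K */
    80,     /* 16K */
    70,     /* 32K */
    60,     /* 64K */
    50,     /* 128K */
    40,     /* 256K */
    30,     /* 512K */
    20,     /* 1M */
    15,     /* 2M */
    10,     /* 4M */
    5,      /* 8M */
    0       /* 16M */
};

/*
 * histogram[i] counts free segments of roughly 2^(i + shift) bytes, where
 * 'shift' plays the role of the space map's sm_shift.  Weight the space in
 * each bucket by the table above, divide by the total free space, and the
 * result is a percentage in [0, 100].
 */
static uint64_t
fragmentation_from_histogram(const uint64_t histogram[HISTOGRAM_SIZE], int shift)
{
    uint64_t frag = 0, total = 0;

    for (int i = 0; i < HISTOGRAM_SIZE; i++) {
        int idx = shift - MINBLOCKSHIFT + i;

        if (histogram[i] == 0)
            continue;
        if (idx > FRAG_TABLE_SIZE - 1)
            idx = FRAG_TABLE_SIZE - 1;

        /* Bytes of free space represented by this bucket. */
        uint64_t space = histogram[i] << (i + shift);

        total += space;
        frag += space * (uint64_t)frag_table[idx];
    }

    return (total > 0 ? frag / total : 0);
}

int
main(void)
{
    uint64_t hist[HISTOGRAM_SIZE] = { 0 };

    /* Hypothetical metaslab: many 8K holes plus a few free 4M regions. */
    hist[4] = 10000;    /* bucket 4 with shift 9 => 2^13 = 8K segments */
    hist[13] = 20;      /* bucket 13 with shift 9 => 2^22 = 4M segments */

    printf("fragmentation: %llu%%\n", (unsigned long long)
        fragmentation_from_histogram(hist, MINBLOCKSHIFT));
    return (0);
}

With these made-up numbers the 8K space (about 78 MB, charged at 90%) and the 4M space (about 80 MB, charged at 10%) average out to roughly 49% fragmentation, which is the kind of value that would then feed into the metaslab's weight.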
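At the group and pool level the patch aggregates rather than recomputes: metaslab_group_fragmentation() takes a simple average over a group's metaslabs (valid only when more than half of them report a metric), and metaslab_class_fragmentation() takes a space-weighted average over the groups, returning ZFS_FRAG_INVALID as soon as any group lacks a metric. The following hedged sketch models that pool-level roll-up with illustrative structures rather than the real metaslab_group_t; the numbers in main() are hypothetical.

#include <stdint.h>
#include <stdio.h>

#define FRAG_INVALID UINT64_MAX     /* mirrors ZFS_FRAG_INVALID */

struct group {
    uint64_t frag;      /* like mg_fragmentation: 0-100 or FRAG_INVALID */
    uint64_t space;     /* like metaslab_group_get_space(): bytes managed */
};

static uint64_t
class_fragmentation(const struct group *groups, int ngroups)
{
    uint64_t frag = 0, space = 0;

    for (int g = 0; g < ngroups; g++) {
        /* A single group without a metric invalidates the pool value. */
        if (groups[g].frag == FRAG_INVALID)
            return (FRAG_INVALID);
        frag += groups[g].frag * groups[g].space;
        space += groups[g].space;
    }
    return (space > 0 ? frag / space : 0);
}

int
main(void)
{
    /* Two hypothetical top-level vdevs: 1 TB at 30% and 2 TB at 60%. */
    struct group pool[] = {
        { 30, 1ULL << 40 },
        { 60, 2ULL << 40 },
    };

    printf("pool fragmentation: %llu%%\n",
        (unsigned long long)class_fragmentation(pool, 2));  /* 50% */
    return (0);
}

The weighted result, 50% in this example, is what gets surfaced through the new "fragmentation" pool property (the FRAG column registered in zpool_prop.c), while the real kernel code divides by the class-wide space under the SCL_VDEV config lock instead of the summed group space used in this simplified model.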