summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
diff options
context:
space:
mode:
authormm <mm@FreeBSD.org>2010-07-12 23:49:04 +0000
committermm <mm@FreeBSD.org>2010-07-12 23:49:04 +0000
commitb2946e89348042300795fce8f0b12a01250541df (patch)
tree528115d6014d608781cfcb91d5b3a5ba0cfcc892 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
parent1b4c9c446a5b4f260f236b1053bc42f683ef18cb (diff)
downloadFreeBSD-src-b2946e89348042300795fce8f0b12a01250541df.zip
FreeBSD-src-b2946e89348042300795fce8f0b12a01250541df.tar.gz
Merge ZFS version 15 and almost all OpenSolaris bugfixes referenced
in Solaris 10 updates 141445-09 and 142901-14. Detailed information: (OpenSolaris revisions and Bug IDs, Solaris 10 patch numbers) 7844:effed23820ae 6755435 zfs_open() and zfs_close() needs to use ZFS_ENTER/ZFS_VERIFY_ZP (141445-01) 7897:e520d8258820 6748436 inconsistent zpool.cache in boot_archive could panic a zfs root filesystem upon boot-up (141445-01) 7965:b795da521357 6740164 zpool attach can create an illegal root pool (141909-02) 8084:b811cc60d650 6769612 zpool_import() will continue to write to cachefile even if altroot is set (N/A) 8121:7fd09d4ebd9c 6757430 want an option for zdb to disable space map loading and leak tracking (141445-01) 8129:e4f45a0bfbb0 6542860 ASSERT: reason != VDEV_LABEL_REMOVE||vdev_inuse(vd, crtxg, reason, 0) (141445-01) 8188:fd00c0a81e80 6761100 want zdb option to select older uberblocks (141445-01) 8190:6eeea43ced42 6774886 zfs_setattr() won't allow ndmp to restore SUNWattr_rw (141445-01) 8225:59a9961c2aeb 6737463 panic while trying to write out config file if root pool import fails (141445-01) 8227:f7d7be9b1f56 6765294 Refactor replay (141445-01) 8228:51e9ca9ee3a5 6572357 libzfs should do more to avoid mnttab lookups (141909-01) 6572376 zfs_iter_filesystems and zfs_iter_snapshots get objset stats twice (141909-01) 8241:5a60f16123ba 6328632 zpool offline is a bit too conservative (141445-01) 6739487 ASSERT: txg <= spa_final_txg due to scrub/export race (141445-01) 6767129 ASSERT: cvd->vdev_isspare, in spa_vdev_detach() (141445-01) 6747698 checksum failures after offline -t / export / import / scrub (141445-01) 6745863 ZFS writes to disk after it has been offlined (141445-01) 6722540 50% slowdown on scrub/resilver with certain vdev configurations (141445-01) 6759999 resilver logic rewrites ditto blocks on both source and destination (141445-01) 6758107 I/O should never suspend during spa_load() (141445-01) 6776548 codereview(1) runs off the page when faced with multi-line comments (N/A) 6761406 AMD errata 91 workaround doesn't work on 64-bit systems (141445-01) 8242:e46e4b2f0a03 6770866 GRUB/ZFS should require physical path or devid, but not both (141445-01) 8269:03a7e9050cfd 6674216 "zfs share" doesn't work, but "zfs set sharenfs=on" does (141445-01) 6621164 $SRC/cmd/zfs/zfs_main.c seems to have a syntax error in the translation note (141445-01) 6635482 i18n problems in libzfs_dataset.c and zfs_main.c (141445-01) 6595194 "zfs get" VALUE column is as wide as NAME (141445-01) 6722991 vdev_disk.c: error checking for ddi_pathname_to_dev_t() must test for NODEV (141445-01) 6396518 ASSERT strings shouldn't be pre-processed (141445-01) 8274:846b39508aff 6713916 scrub/resilver needlessly decompress data (141445-01) 8343:655db2375fed 6739553 libzfs_status msgid table is out of sync (141445-01) 6784104 libzfs unfairly rejects numerical values greater than 2^63 (141445-01) 6784108 zfs_realloc() should not free original memory on failure (141445-01) 8525:e0e0e525d0f8 6788830 set large value to reservation cause core dump (141445-01) 6791064 want sysevents for ZFS scrub (141445-01) 6791066 need to be able to set cachefile on faulted pools (141445-01) 6791071 zpool_do_import() should not enable datasets on faulted pools (141445-01) 6792134 getting multiple properties on a faulted pool leads to confusion (141445-01) 8547:bcc7b46e5ff7 6792884 Vista clients cannot access .zfs (141445-01) 8632:36ef517870a3 6798384 It can take a village to raise a zio (141445-01) 8636:7e4ce9158df3 6551866 deadlock between zfs_write(), zfs_freesp(), and zfs_putapage() (141909-01) 6504953 zfs_getpage() misunderstands VOP_GETPAGE() interface (141909-01) 6702206 ZFS read/writer lock contention throttles sendfile() benchmark (141445-01) 6780491 Zone on a ZFS filesystem has poor fork/exec performance (141445-01) 6747596 assertion failed: DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), BP_IDENTITY(zio->io_bp))); (141445-01) 8692:692d4668b40d 6801507 ZFS read aggregation should not mind the gap (141445-01) 8697:e62d2612c14d 6633095 creating a filesystem with many properties set is slow (141445-01) 8768:dfecfdbb27ed 6775697 oracle crashes when overwriting after hitting quota on zfs (141909-01) 8811:f8deccf701cf 6790687 libzfs mnttab caching ignores external changes (141445-01) 6791101 memory leak from libzfs_mnttab_init (141445-01) 8845:91af0d9c0790 6800942 smb_session_create() incorrectly stores IP addresses (N/A) 6582163 Access Control List (ACL) for shares (141445-01) 6804954 smb_search - shortname field should be space padded following the NULL terminator (N/A) 6800184 Panic at smb_oplock_conflict+0x35() (N/A) 8876:59d2e67b4b65 6803822 Reboot after replacement of system disk in a ZFS mirror drops to grub> prompt (141445-01) 8924:5af812f84759 6789318 coredump when issue zdb -uuuu poolname/ (141445-01) 6790345 zdb -dddd -e poolname coredump (141445-01) 6797109 zdb: 'zdb -dddddd pool_name/fs_name inode' coredump if the file with inode was deleted (141445-01) 6797118 zdb: 'zdb -dddddd poolname inum' coredump if I miss the fs name (141445-01) 6803343 shareiscsi=on failed, iscsitgtd failed request to share (141445-01) 9030:243fd360d81f 6815893 hang mounting a dataset after booting into a new boot environment (141445-01) 9056:826e1858a846 6809691 'zpool create -f' no longer overwrites ufs infomation (141445-01) 9179:d8fbd96b79b3 6790064 zfs needs to determine uid and gid earlier in create process (141445-01) 9214:8d350e5d04aa 6604992 forced unmount + being in .zfs/snapshot/<snap1> = not happy (141909-01) 6810367 assertion failed: dvp->v_flag & VROOT, file: ../../common/fs/gfs.c, line: 426 (141909-01) 9229:e3f8b41e5db4 6807765 ztest_dsl_dataset_promote_busy needs to clean up after ENOSPC (141445-01) 9230:e4561e3eb1ef 6821169 offlining a device results in checksum errors (141445-01) 6821170 ZFS should not increment error stats for unavailable devices (141445-01) 6824006 need to increase issue and interrupt taskqs threads in zfs (141445-01) 9234:bffdc4fc05c4 6792139 recovering from a suspended pool needs some work (141445-01) 6794830 reboot command hangs on a failed zfs pool (141445-01) 9246:67c03c93c071 6824062 System panicked in zfs_mount due to NULL pointer dereference when running btts and svvs tests (141909-01) 9276:a8a7fc849933 6816124 System crash running zpool destroy on broken zpool (141445-03) 9355:09928982c591 6818183 zfs snapshot -r is slow due to set_snap_props() doing txg_wait_synced() for each new snapshot (141445-03) 9391:413d0661ef33 6710376 log device can show incorrect status when other parts of pool are degraded (141445-03) 9396:f41cf682d0d3 (part already merged) 6501037 want user/group quotas on ZFS (141445-03) 6827260 assertion failed in arc_read(): hdr == pbuf->b_hdr (141445-03) 6815592 panic: No such hold X on refcount Y from zfs_znode_move (141445-03) 6759986 zfs list shows temporary %clone when doing online zfs recv (141445-03) 9404:319573cd93f8 6774713 zfs ignores canmount=noauto when sharenfs property != off (141445-03) 9412:4aefd8704ce0 6717022 ZFS DMU needs zero-copy support (141445-03) 9425:e7ffacaec3a8 6799895 spa_add_spares() needs to be protected by config lock (141445-03) 6826466 want to post sysevents on hot spare activation (141445-03) 6826468 spa 'allowfaulted' needs some work (141445-03) 6826469 kernel support for storing vdev FRU information (141445-03) 6826470 skip posting checksum errors from DTL regions of leaf vdevs (141445-03) 6826471 I/O errors after device remove probe can confuse FMA (141445-03) 6826472 spares should enjoy some of the benefits of cache devices (141445-03) 9443:2a96d8478e95 6833711 gang leaders shouldn't have to be logical (141445-03) 9463:d0bd231c7518 6764124 want zdb to be able to checksum metadata blocks only (141445-03) 9465:8372081b8019 6830237 zfs panic in zfs_groupmember() (141445-03) 9466:1fdfd1fed9c4 6833162 phantom log device in zpool status (141445-03) 9469:4f68f041ddcd 6824968 add ZFS userquota support to rquotad (141445-03) 9470:6d827468d7b5 6834217 godfather I/O should reexecute (141445-03) 9480:fcff33da767f 6596237 Stop looking and start ganging (141909-02) 9493:9933d599bc93 6623978 lwb->lwb_buf != NULL, file ../../../uts/common/fs/zfs/zil.c, line 787, function zil_lwb_commit (141445-06) 9512:64cafcbcc337 6801810 Commit of aligned streaming rewrites to ZIL device causes unwanted disk reads (N/A) 9515:d3b739d9d043 6586537 async zio taskqs can block out userland commands (142901-09) 9554:787363635b6a 6836768 zfs_userspace() callback has no way to indicate failure (N/A) 9574:1eb6a6ab2c57 6838062 zfs panics when an error is encountered in space_map_load() (141909-02) 9583:b0696cd037cc 6794136 Panic BAD TRAP: type=e when importing degraded zraid pool. (141909-03) 9630:e25a03f552e0 6776104 "zfs import" deadlock between spa_unload() and spa_async_thread() (141445-06) 9653:a70048a304d1 6664765 Unable to remove files when using fat-zap and quota exceeded on ZFS filesystem (141445-06) 9688:127be1845343 6841321 zfs userspace / zfs get userused@ doesn't work on mounted snapshot (N/A) 6843069 zfs get userused@S-1-... doesn't work (N/A) 9873:8ddc892eca6e 6847229 assertion failed: refcount_count(&tx->tx_space_written) + delta <= tx->tx_space_towrite in dmu_tx.c (141445-06) 9904:d260bd3fd47c 6838344 kernel heap corruption detected on zil while stress testing (141445-06) 9951:a4895b3dd543 6844900 zfs_ioc_userspace_upgrade leaks (N/A) 10040:38b25aeeaf7a 6857012 zfs panics on zpool import (141445-06) 10000:241a51d8720c 6848242 zdb -e no longer works as expected (N/A) 10100:4a6965f6bef8 6856634 snv_117 not booting: zfs_parse_bootfs: error2 (141445-07) 10160:a45b03783d44 6861983 zfs should use new name <-> SID interfaces (N/A) 6862984 userquota commands can hang (141445-06) 10299:80845694147f 6696858 zfs receive of incremental replication stream can dereference NULL pointer and crash (N/A) 10302:a9e3d1987706 6696858 zfs receive of incremental replication stream can dereference NULL pointer and crash (fix lint) (N/A) 10575:2a8816c5173b (partial merge) 6882227 spa_async_remove() shouldn't do a full clear (142901-14) 10800:469478b180d9 6880764 fsync on zfs is broken if writes are greater than 32kb on a hard crash and no log attached (142901-09) 6793430 zdb -ivvvv assertion failure: bp->blk_cksum.zc_word[2] == dmu_objset_id(zilog->zl_os) (N/A) 10801:e0bf032e8673 (partial merge) 6822816 assertion failed: zap_remove_int(ds_next_clones_obj) returns ENOENT (142901-09) 10810:b6b161a6ae4a 6892298 buf->b_hdr->b_state != arc_anon, file: ../../common/fs/zfs/arc.c, line: 2849 (142901-09) 10890:499786962772 6807339 spurious checksum errors when replacing a vdev (142901-13) 11249:6c30f7dfc97b 6906110 bad trap panic in zil_replay_log_record (142901-13) 6906946 zfs replay isn't handling uid/gid correctly (142901-13) 11454:6e69bacc1a5a 6898245 suspended zpool should not cause rest of the zfs/zpool commands to hang (142901-10) 11546:42ea6be8961b (partial merge) 6833999 3-way deadlock in dsl_dataset_hold_ref() and dsl_sync_task_group_sync() (142901-09) Discussed with: pjd Approved by: delphij (mentor) Obtained from: OpenSolaris (multiple Bug IDs) MFC after: 2 months
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c261
1 files changed, 226 insertions, 35 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
index 47f8f5f..d216154 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c
@@ -19,7 +19,7 @@
* CDDL HEADER END
*/
/*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
@@ -36,18 +36,35 @@ uint64_t metaslab_aliquot = 512ULL << 10;
uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */
/*
+ * Minimum size which forces the dynamic allocator to change
+ * it's allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using more
+ * aggressive strategy (i.e search by size rather than offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE;
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space_map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 30;
+
+/*
* ==========================================================================
* Metaslab classes
* ==========================================================================
*/
metaslab_class_t *
-metaslab_class_create(void)
+metaslab_class_create(space_map_ops_t *ops)
{
metaslab_class_t *mc;
mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
mc->mc_rotor = NULL;
+ mc->mc_ops = ops;
return (mc);
}
@@ -202,30 +219,14 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight)
}
/*
- * ==========================================================================
- * The first-fit block allocator
- * ==========================================================================
+ * This is a helper function that can be used by the allocator to find
+ * a suitable block to allocate. This will search the specified AVL
+ * tree looking for a block that matches the specified criteria.
*/
-static void
-metaslab_ff_load(space_map_t *sm)
-{
- ASSERT(sm->sm_ppd == NULL);
- sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
-}
-
-static void
-metaslab_ff_unload(space_map_t *sm)
-{
- kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
- sm->sm_ppd = NULL;
-}
-
static uint64_t
-metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size,
+ uint64_t align)
{
- avl_tree_t *t = &sm->sm_root;
- uint64_t align = size & -size;
- uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
space_seg_t *ss, ssearch;
avl_index_t where;
@@ -254,7 +255,37 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size)
return (-1ULL);
*cursor = 0;
- return (metaslab_ff_alloc(sm, size));
+ return (metaslab_block_picker(t, cursor, size, align));
+}
+
+/*
+ * ==========================================================================
+ * The first-fit block allocator
+ * ==========================================================================
+ */
+static void
+metaslab_ff_load(space_map_t *sm)
+{
+ ASSERT(sm->sm_ppd == NULL);
+ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+ sm->sm_pp_root = NULL;
+}
+
+static void
+metaslab_ff_unload(space_map_t *sm)
+{
+ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+ sm->sm_ppd = NULL;
+}
+
+static uint64_t
+metaslab_ff_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+
+ return (metaslab_block_picker(t, cursor, size, align));
}
/* ARGSUSED */
@@ -276,9 +307,136 @@ static space_map_ops_t metaslab_ff_ops = {
metaslab_ff_unload,
metaslab_ff_alloc,
metaslab_ff_claim,
- metaslab_ff_free
+ metaslab_ff_free,
+ NULL /* maxsize */
+};
+
+/*
+ * Dynamic block allocator -
+ * Uses the first fit allocation scheme until space get low and then
+ * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold
+ * and metaslab_df_free_pct to determine when to switch the allocation scheme.
+ */
+
+uint64_t
+metaslab_df_maxsize(space_map_t *sm)
+{
+ avl_tree_t *t = sm->sm_pp_root;
+ space_seg_t *ss;
+
+ if (t == NULL || (ss = avl_last(t)) == NULL)
+ return (0ULL);
+
+ return (ss->ss_end - ss->ss_start);
+}
+
+static int
+metaslab_df_seg_compare(const void *x1, const void *x2)
+{
+ const space_seg_t *s1 = x1;
+ const space_seg_t *s2 = x2;
+ uint64_t ss_size1 = s1->ss_end - s1->ss_start;
+ uint64_t ss_size2 = s2->ss_end - s2->ss_start;
+
+ if (ss_size1 < ss_size2)
+ return (-1);
+ if (ss_size1 > ss_size2)
+ return (1);
+
+ if (s1->ss_start < s2->ss_start)
+ return (-1);
+ if (s1->ss_start > s2->ss_start)
+ return (1);
+
+ return (0);
+}
+
+static void
+metaslab_df_load(space_map_t *sm)
+{
+ space_seg_t *ss;
+
+ ASSERT(sm->sm_ppd == NULL);
+ sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP);
+
+ sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
+ avl_create(sm->sm_pp_root, metaslab_df_seg_compare,
+ sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node));
+
+ for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss))
+ avl_add(sm->sm_pp_root, ss);
+}
+
+static void
+metaslab_df_unload(space_map_t *sm)
+{
+ void *cookie = NULL;
+
+ kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t));
+ sm->sm_ppd = NULL;
+
+ while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) {
+ /* tear down the tree */
+ }
+
+ avl_destroy(sm->sm_pp_root);
+ kmem_free(sm->sm_pp_root, sizeof (avl_tree_t));
+ sm->sm_pp_root = NULL;
+}
+
+static uint64_t
+metaslab_df_alloc(space_map_t *sm, uint64_t size)
+{
+ avl_tree_t *t = &sm->sm_root;
+ uint64_t align = size & -size;
+ uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1;
+ uint64_t max_size = metaslab_df_maxsize(sm);
+ int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+ ASSERT(MUTEX_HELD(sm->sm_lock));
+ ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root));
+
+ if (max_size < size)
+ return (-1ULL);
+
+ /*
+ * If we're running low on space switch to using the size
+ * sorted AVL tree (best-fit).
+ */
+ if (max_size < metaslab_df_alloc_threshold ||
+ free_pct < metaslab_df_free_pct) {
+ t = sm->sm_pp_root;
+ *cursor = 0;
+ }
+
+ return (metaslab_block_picker(t, cursor, size, 1ULL));
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_claim(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+/* ARGSUSED */
+static void
+metaslab_df_free(space_map_t *sm, uint64_t start, uint64_t size)
+{
+ /* No need to update cursor */
+}
+
+static space_map_ops_t metaslab_df_ops = {
+ metaslab_df_load,
+ metaslab_df_unload,
+ metaslab_df_alloc,
+ metaslab_df_claim,
+ metaslab_df_free,
+ metaslab_df_maxsize
};
+space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+
/*
* ==========================================================================
* Metaslabs
@@ -414,20 +572,28 @@ metaslab_weight(metaslab_t *msp)
}
static int
-metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
+metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size)
{
space_map_t *sm = &msp->ms_map;
+ space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops;
ASSERT(MUTEX_HELD(&msp->ms_lock));
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
- int error = space_map_load(sm, &metaslab_ff_ops,
- SM_FREE, &msp->ms_smo,
+ int error = space_map_load(sm, sm_ops, SM_FREE, &msp->ms_smo,
msp->ms_group->mg_vd->vdev_spa->spa_meta_objset);
if (error) {
metaslab_group_sort(msp->ms_group, msp, 0);
return (error);
}
+
+ /*
+ * If we were able to load the map then make sure
+ * that this map is still able to satisfy our request.
+ */
+ if (msp->ms_weight < size)
+ return (ENOSPC);
+
metaslab_group_sort(msp->ms_group, msp,
msp->ms_weight | activation_weight);
}
@@ -636,11 +802,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
int i;
activation_weight = METASLAB_WEIGHT_PRIMARY;
- for (i = 0; i < d; i++)
- if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id)
+ for (i = 0; i < d; i++) {
+ if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) {
activation_weight = METASLAB_WEIGHT_SECONDARY;
+ break;
+ }
+ }
for (;;) {
+ boolean_t was_active;
+
mutex_enter(&mg->mg_lock);
for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) {
if (msp->ms_weight < size) {
@@ -648,6 +819,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
return (-1ULL);
}
+ was_active = msp->ms_weight & METASLAB_ACTIVE_MASK;
if (activation_weight == METASLAB_WEIGHT_PRIMARY)
break;
@@ -673,7 +845,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
* another thread may have changed the weight while we
* were blocked on the metaslab lock.
*/
- if (msp->ms_weight < size) {
+ if (msp->ms_weight < size || (was_active &&
+ !(msp->ms_weight & METASLAB_ACTIVE_MASK) &&
+ activation_weight == METASLAB_WEIGHT_PRIMARY)) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -686,7 +860,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg,
continue;
}
- if (metaslab_activate(msp, activation_weight) != 0) {
+ if (metaslab_activate(msp, activation_weight, size) != 0) {
mutex_exit(&msp->ms_lock);
continue;
}
@@ -720,6 +894,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize,
vdev_t *vd;
int dshift = 3;
int all_zero;
+ int zio_lock = B_FALSE;
+ boolean_t allocatable;
uint64_t offset = -1ULL;
uint64_t asize;
uint64_t distance;
@@ -778,11 +954,20 @@ top:
all_zero = B_TRUE;
do {
vd = mg->mg_vd;
+
/*
* Don't allocate from faulted devices.
*/
- if (!vdev_allocatable(vd))
+ if (zio_lock) {
+ spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER);
+ allocatable = vdev_allocatable(vd);
+ spa_config_exit(spa, SCL_ZIO, FTAG);
+ } else {
+ allocatable = vdev_allocatable(vd);
+ }
+ if (!allocatable)
goto next;
+
/*
* Avoid writing single-copy data to a failing vdev
*/
@@ -858,6 +1043,12 @@ next:
goto top;
}
+ if (!allocatable && !zio_lock) {
+ dshift = 3;
+ zio_lock = B_TRUE;
+ goto top;
+ }
+
bzero(&dva[d], sizeof (dva_t));
return (ENOSPC);
@@ -938,7 +1129,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
mutex_enter(&msp->ms_lock);
- error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY);
+ error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0);
if (error || txg == 0) { /* txg == 0 indicates dry run */
mutex_exit(&msp->ms_lock);
return (error);
@@ -946,7 +1137,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg)
space_map_claim(&msp->ms_map, offset, size);
- if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */
+ if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */
if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0)
vdev_dirty(vd, VDD_METASLAB, msp, txg);
space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size);
OpenPOWER on IntegriCloud