diff options
author | avg <avg@FreeBSD.org> | 2015-07-06 10:40:51 +0000 |
---|---|---|
committer | avg <avg@FreeBSD.org> | 2015-07-06 10:40:51 +0000 |
commit | 998834455cb651366ad087f0f5206af8a1b7823f (patch) | |
tree | 87da8b87edf218e47d8fccb6042d5daf022337ef /sys/cddl | |
parent | afd8ff0c41f83c40a76a6ac07b15a729a8cbdc7d (diff) | |
download | FreeBSD-src-998834455cb651366ad087f0f5206af8a1b7823f.zip FreeBSD-src-998834455cb651366ad087f0f5206af8a1b7823f.tar.gz |
MFC r284593: MFV r284412: 5911 ZFS "hangs" while deleting file
illumos/illumos-gate@46e1baa6cf6d5432f5fd231bb588df8f9570c858
https://www.illumos.org/issues/5911
Sometimes ZFS appears to hang while deleting a file. It is actually
making slow progress at the file deletion, but other operations
(administrative and writes via the data path) "hang" until the file
removal completes, which can take a long time if the file has many
blocks. The deletion (or most of it) happens in a single txg, and the
sync thread spends most of its time reading indirect blocks...
Reviewed by: Bayard Bell <buffer.g.overflow@gmail.com>
Reviewed by: Alek Pinchuk <alek@nexenta.com>
Reviewed by: Simon Klinkert <simon.klinkert@gmail.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
Author: Matthew Ahrens <mahrens@delphix.com>
PR: 199775
Approved by: re(kib)
Diffstat (limited to 'sys/cddl')
5 files changed, 89 insertions, 33 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index ea3c688..2b55290 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. * Copyright (c) 2013, Joyent, Inc. All rights reserved. */ @@ -1298,6 +1298,16 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); + + /* + * Due to our use of dn_nlevels below, this can only be called + * in open context, unless we are operating on the MOS. + * From syncing context, dn_nlevels may be different from the + * dn_nlevels used when dbuf was dirtied. + */ + ASSERT(db->db_objset == + dmu_objset_pool(db->db_objset)->dp_meta_objset || + txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); ASSERT(db->db_blkid != DMU_BONUS_BLKID); ASSERT0(db->db_level); ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -1320,11 +1330,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db.db_size != 0); - /* - * Any space we accounted for in dp_dirty_* will be cleaned up by - * dsl_pool_sync(). This is relatively rare so the discrepancy - * is not a big deal. - */ + dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), + dr->dr_accounted, txg); *drp = dr->dr_next; @@ -1339,7 +1346,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_blkid == DMU_SPILL_BLKID || - db->db_level+1 == dn->dn_nlevels) { + db->db_level + 1 == dn->dn_nlevels) { ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); @@ -1356,11 +1363,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db)); } - if (db->db_level != 0) { - mutex_destroy(&dr->dt.di.dr_mtx); - list_destroy(&dr->dt.di.dr_children); - } - kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); @@ -2318,7 +2320,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); - dbuf_sync_list(&dr->dt.di.dr_children, tx); + dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); mutex_exit(&dr->dt.di.dr_mtx); zio_nowait(zio); @@ -2464,7 +2466,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } void -dbuf_sync_list(list_t *list, dmu_tx_t *tx) +dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx) { dbuf_dirty_record_t *dr; @@ -2481,6 +2483,10 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) DMU_META_DNODE_OBJECT); break; } + if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + VERIFY3U(dr->dr_dbuf->db_level, ==, level); + } list_remove(list, dr); if (dr->dr_dbuf->db_level > 0) dbuf_sync_indirect(dr, tx); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c index 1f42704..dff9fab 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright 2011 Nexenta Systems, Inc. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include <sys/dmu.h> @@ -686,7 +686,7 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) uint64_t ibyte = i << shift; err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0); i = ibyte >> shift; - if (err == ESRCH) + if (err == ESRCH || i > end) break; if (err) { tx->tx_err = err; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c index 665165a..96deeb7 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode.c @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -1492,6 +1492,16 @@ out: rw_downgrade(&dn->dn_struct_rwlock); } +static void +dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); + if (db != NULL) { + dmu_buf_will_dirty(&db->db, tx); + dbuf_rele(db, FTAG); + } +} + void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) { @@ -1612,27 +1622,67 @@ dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) nblks += 1; /* - * Dirty the first and last indirect blocks, as they (and/or their - * parents) will need to be written out if they were only - * partially freed. Interior indirect blocks will be themselves freed, - * by free_children(), so they need not be dirtied. Note that these - * interior blocks have already been prefetched by dmu_tx_hold_free(). + * Dirty all the indirect blocks in this range. Note that only + * the first and last indirect blocks can actually be written + * (if they were partially freed) -- they must be dirtied, even if + * they do not exist on disk yet. The interior blocks will + * be freed by free_children(), so they will not actually be written. + * Even though these interior blocks will not be written, we + * dirty them for two reasons: + * + * - It ensures that the indirect blocks remain in memory until + * syncing context. (They have already been prefetched by + * dmu_tx_hold_free(), so we don't have to worry about reading + * them serially here.) + * + * - The dirty space accounting will put pressure on the txg sync + * mechanism to begin syncing, and to delay transactions if there + * is a large amount of freeing. Even though these indirect + * blocks will not be written, we could need to write the same + * amount of space if we copy the freed BPs into deadlists. */ if (dn->dn_nlevels > 1) { uint64_t first, last; first = blkid >> epbs; - if (db = dbuf_hold_level(dn, 1, first, FTAG)) { - dmu_buf_will_dirty(&db->db, tx); - dbuf_rele(db, FTAG); - } + dnode_dirty_l1(dn, first, tx); if (trunc) last = dn->dn_maxblkid >> epbs; else last = (blkid + nblks - 1) >> epbs; - if (last > first && (db = dbuf_hold_level(dn, 1, last, FTAG))) { - dmu_buf_will_dirty(&db->db, tx); - dbuf_rele(db, FTAG); + if (last != first) + dnode_dirty_l1(dn, last, tx); + + int shift = dn->dn_datablkshift + dn->dn_indblkshift - + SPA_BLKPTRSHIFT; + for (uint64_t i = first + 1; i < last; i++) { + /* + * Set i to the blockid of the next non-hole + * level-1 indirect block at or after i. Note + * that dnode_next_offset() operates in terms of + * level-0-equivalent bytes. + */ + uint64_t ibyte = i << shift; + int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, + &ibyte, 2, 1, 0); + i = ibyte >> shift; + if (i >= last) + break; + + /* + * Normally we should not see an error, either + * from dnode_next_offset() or dbuf_hold_level() + * (except for ESRCH from dnode_next_offset). + * If there is an i/o error, then when we read + * this block in syncing context, it will use + * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according + * to the "failmode" property. dnode_next_offset() + * doesn't have a flag to indicate MUSTSUCCEED. + */ + if (err != 0) + break; + + dnode_dirty_l1(dn, i, tx); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c index 63bfc94..418d318 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dnode_sync.c @@ -21,7 +21,7 @@ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. */ #include <sys/zfs_context.h> @@ -712,7 +712,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); } - dbuf_sync_list(list, tx); + dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h index 8be8ed6b..319406a 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dbuf.h @@ -20,7 +20,7 @@ */ /* * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. - * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */ @@ -287,7 +287,7 @@ void dbuf_evict(dmu_buf_impl_t *db); void dbuf_setdirty(dmu_buf_impl_t *db, dmu_tx_t *tx); void dbuf_unoverride(dbuf_dirty_record_t *dr); -void dbuf_sync_list(list_t *list, dmu_tx_t *tx); +void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); void dbuf_release_bp(dmu_buf_impl_t *db); void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, |