diff options
author | pjd <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000 |
---|---|---|
committer | pjd <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000 |
commit | bbe899b96e388a8b82439f81ed3707e0d9c6070d (patch) | |
tree | 81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | |
parent | d2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff) | |
download | FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz |
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This brings a huge amount of changes; I'll enumerate only the user-visible ones:
- Delegated Administration
Allows regular users to perform ZFS operations, like file system
creation, snapshot creation, etc.
- L2ARC
Level 2 cache for ZFS - allows the use of additional disks for caching.
Huge performance improvements, mostly for random reads of mostly
static content.
- slog
Allows the use of additional disks for the ZFS Intent Log to speed up
operations like fsync(2).
- vfs.zfs.super_owner
Allows regular users to perform privileged operations on files stored
on ZFS file systems owned by them. Be very careful with this one.
- chflags(2)
Not all the flags are supported. This still needs work.
- ZFSBoot
Support to boot off of ZFS pool. Not finished, AFAIK.
Submitted by: dfr
- Snapshot properties
- New failure modes
Before, if a write request failed, the system panicked. Now one
can select one of three failure modes:
- panic - panic on write error
- wait - wait for disk to reappear
- continue - serve read requests if possible, block write requests
- Refquota, refreservation properties
Like the quota and reservation properties, but they don't count space
consumed by child file systems, clones and snapshots.
- Sparse volumes
ZVOLs that don't reserve space in the pool.
- External attributes
Compatible with extattr(2).
- NFSv4-ACLs
Not sure about the status, might not be complete yet.
Submitted by: trasz
- Creation-time properties
- Regression tests for zpool(8) command.
Obtained from: OpenSolaris
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c')
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 405 |
1 files changed, 226 insertions, 179 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 94c6308..2494c1e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dmu.h> #include <sys/dmu_impl.h> @@ -39,17 +37,10 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static arc_done_func_t dbuf_write_ready; static arc_done_func_t dbuf_write_done; -int zfs_mdcomp_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); - /* * Global data structures and functions for the dbuf cache. */ @@ -311,7 +302,7 @@ dbuf_verify(dmu_buf_impl_t *db) } if (db->db_blkid == DB_BONUS_BLKID) { ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); @@ -460,45 +451,45 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { - blkptr_t *bp; + dnode_t *dn = db->db_dnode; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; + arc_buf_t *pbuf; ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ - ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); + int bonuslen = dn->dn_bonuslen; + + ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - if (db->db.db_size < DN_MAX_BONUSLEN) + arc_space_consume(DN_MAX_BONUSLEN); + if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, - db->db.db_size); + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } - if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) - bp = NULL; - else - bp = db->db_blkptr; - - if (bp == NULL) - dprintf_dbuf(db, "blkptr: %s\n", "NULL"); - else - dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); - - if (bp == NULL || BP_IS_HOLE(bp)) { + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - ASSERT(bp == NULL || BP_IS_HOLE(bp)); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; @@ -510,6 +501,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) db->db_state = DB_READ; mutex_exit(&db->db_mtx); + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_L2CACHE; + zb.zb_objset = db->db_objset->os_dsl_dataset ? 
db->db_objset->os_dsl_dataset->ds_object : 0; zb.zb_object = db->db.db_object; @@ -518,10 +512,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ - ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); - (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, - db->db_level > 0 ? byteswap_uint64_array : - dmu_ot[db->db_dnode->dn_type].ot_byteswap, + + if (db->db_parent) + pbuf = db->db_parent->db_buf; + else + pbuf = db->db_objset->os_phys_buf; + + (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -546,7 +543,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; + (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { @@ -661,6 +659,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (db->db_blkid == DB_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -690,7 +689,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* free this block */ if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { /* XXX can get silent EIO here */ - (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, + (void) dsl_free(NULL, + spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -705,22 +705,50 @@ 
dbuf_unoverride(dbuf_dirty_record_t *dr) arc_release(dr->dt.dl.dr_data, db); } +/* + * Evict (if its unreferenced) or clear (if its referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. Also, if we happen accross any level-1 dbufs in the + * range that have not already been marked dirty, mark them dirty so + * they stay in memory. + */ void -dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; - dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); + if (end > dn->dn_maxblkid) { + end = dn->dn_maxblkid; + last_l1 = end >> epbs; + } + dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DB_BONUS_BLKID); + + if (db->db_level == 1 && + db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { + mutex_enter(&db->db_mtx); + if (db->db_last_dirty && + db->db_last_dirty->dr_txg < txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + if (db->db_level != 0) continue; dprintf_dbuf(db, "found buf %s\n", ""); - if (db->db_blkid < blkid || - db->db_blkid >= blkid+nblks) + if (db->db_blkid < start || db->db_blkid > end) continue; /* found a level 0 buffer in the range */ @@ -783,31 +811,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) } static int -dbuf_new_block(dmu_buf_impl_t *db) +dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; - /* Don't count meta-objects */ - if (ds == NULL) - return 
(FALSE); - /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. */ ASSERT(MUTEX_HELD(&db->db_mtx)); - /* If we have been dirtied since the last snapshot, its not new */ if (db->db_last_dirty) birth_txg = db->db_last_dirty->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; + /* If we don't exist or are in a snapshot, we can't be freed */ if (birth_txg) - return (!dsl_dataset_block_freeable(ds, birth_txg)); + return (ds == NULL || + dsl_dataset_block_freeable(ds, birth_txg)); else - return (TRUE); + return (FALSE); } void @@ -865,6 +890,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; + boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); @@ -922,20 +948,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); - while (*drp && (*drp)->dr_txg > tx->tx_txg) - drp = &(*drp)->dr_next; - if (*drp && (*drp)->dr_txg == tx->tx_txg) { + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ - dbuf_unoverride(*drp); + dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); - return (*drp); + return (dr); } /* @@ -966,6 +992,18 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. 
This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dasize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } + /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this @@ -1015,25 +1053,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_freed_in_flight = FALSE; } - if (db->db_blkid != DB_BONUS_BLKID) { - /* - * Update the accounting. - */ - if (!dbuf_new_block(db) && db->db_blkptr) { - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - dnode_willuse_space(dn, - -bp_get_dasize(os->os_spa, db->db_blkptr), tx); - } - dnode_willuse_space(dn, db->db.db_size, tx); - } - /* * This buffer is now part of this txg */ @@ -1050,11 +1069,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); return (dr); - } - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx); - ASSERT(dn->dn_maxblkid >= db->db_blkid); + } else if (do_free_accounting) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dasize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. 
+ */ + dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { @@ -1062,6 +1089,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drop_struct_lock = TRUE; } + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; @@ -1115,7 +1147,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -1125,7 +1157,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - for (dr = db->db_last_dirty; dr; dr = dr->dr_next) + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) { @@ -1155,14 +1187,14 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* XXX would be nice to fix up dn_towrite_space[] */ - db->db_last_dirty = dr->dr_next; + *drp = dr->dr_next; if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_level+1 == dn->dn_nlevels) { - ASSERT3P(db->db_parent, ==, dn->dn_dbuf); + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); @@ -1178,8 +1210,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else { ASSERT(db->db_buf != NULL); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1204,7 +1236,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) void 
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - int rf = DB_RF_MUST_SUCCEED; + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); @@ -1282,8 +1314,10 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) + if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } @@ -1297,6 +1331,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); dnode_rele(dn, db); + db->db_dnode = NULL; } if (db->db_buf) @@ -1397,10 +1432,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, if (blkid == DB_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = dn->dn_bonuslen; + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t)); return (db); } else { int blocksize = @@ -1427,6 +1465,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t)); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1469,31 +1508,33 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DB_BONUS_BLKID) { - dnode_t *dn = db->db_dnode; - /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. 
*/ - if (list_link_active(&db->db_link)) { + if (db->db_dnode) { + dnode_t *dn = db->db_dnode; + mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); mutex_exit(&dn->dn_dbufs_mtx); dnode_rele(dn, db); + db->db_dnode = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; - db->db_dnode = NULL; db->db_buf = NULL; + ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t)); } void @@ -1525,6 +1566,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { + arc_buf_t *pbuf; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; zb.zb_objset = dn->dn_objset->os_dsl_dataset ? @@ -1533,9 +1575,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) zb.zb_level = 0; zb.zb_blkid = blkid; - (void) arc_read(NULL, dn->dn_objset->os_spa, bp, - dmu_ot[dn->dn_type].ot_byteswap, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + if (db) + pbuf = db->db_buf; + else + pbuf = dn->dn_objset->os_phys_buf; + + (void) arc_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } @@ -1652,16 +1698,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) return (err ? 
NULL : db); } -dmu_buf_impl_t * +void dbuf_create_bonus(dnode_t *dn) { - dmu_buf_impl_t *db = dn->dn_bonus; - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); - return (db); + dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); } #pragma weak dmu_buf_add_ref = dbuf_add_ref @@ -1716,7 +1759,10 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - mutex_exit(&db->db_mtx); + if (!DBUF_IS_CACHEABLE(db)) + dbuf_clear(db); + else + mutex_exit(&db->db_mtx); } } else { mutex_exit(&db->db_mtx); @@ -1852,15 +1898,8 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) db->db_data_pending = dr; - arc_release(db->db_buf, db); mutex_exit(&db->db_mtx); - - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, - zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); + dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); @@ -1878,7 +1917,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - int checksum, compress; int blksz; ASSERT(dmu_tx_is_syncing(tx)); @@ -1909,23 +1947,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ if (db->db_blkid == DB_BONUS_BLKID) { dbuf_dirty_record_t **drp; - /* - * Use dn_phys->dn_bonuslen since db.db_size is the length - * of the bonus buffer in the open transaction rather than - * the syncing transaction. 
- */ + ASSERT(*datap != NULL); ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); - if (*datap != db->db.db_data) + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; - ASSERT((*drp)->dr_next == NULL); - *drp = NULL; + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); @@ -1939,6 +1975,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } /* + * This function may have dropped the db_mtx lock allowing a dmu_sync + * operation to sneak in. As a result, we need to ensure that we + * don't check the dr_override_state until we have returned from + * dbuf_check_blkptr. + */ + dbuf_check_blkptr(dn, db); + + /* * If this buffer is in the middle of an immdiate write, * wait for the synchronous IO to complete. 
*/ @@ -1948,8 +1992,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - dbuf_check_blkptr(dn, db); - /* * If this dbuf has already been written out via an immediate write, * just complete the write by copying over the new block pointer and @@ -1963,6 +2005,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) zio_fake.io_bp = db->db_blkptr; zio_fake.io_bp_orig = *db->db_blkptr; zio_fake.io_txg = txg; + zio_fake.io_flags = 0; *db->db_blkptr = dr->dt.dl.dr_overridden_by; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1970,8 +2013,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dr->dr_zio = &zio_fake; mutex_exit(&db->db_mtx); + ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp), + BP_IDENTITY(&zio_fake.io_bp_orig)) || + BP_IS_HOLE(zio_fake.io_bp)); + if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, + (void) dsl_dataset_block_kill(os->os_dsl_dataset, &zio_fake.io_bp_orig, dn->dn_zio, tx); dbuf_write_ready(&zio_fake, db->db_buf, db); @@ -1997,14 +2044,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } - } else { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(db->db_buf, db); } ASSERT(*datap != NULL); @@ -2012,22 +2051,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) mutex_exit(&db->db_mtx); - /* - * Allow dnode settings to override objset settings, - * except for metadata checksums. 
- */ - if (dmu_ot[dn->dn_type].ot_metadata) { - checksum = os->os_md_checksum; - compress = zio_compress_select(dn->dn_compress, - os->os_md_compress); - } else { - checksum = zio_checksum_select(dn->dn_checksum, - os->os_checksum); - compress = zio_compress_select(dn->dn_compress, - os->os_compress); - } - - dbuf_write(dr, *datap, checksum, compress, tx); + dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) @@ -2063,8 +2087,7 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) } static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx) +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = db->db_dnode; @@ -2072,8 +2095,23 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; - int zio_flags; + + if (!BP_IS_HOLE(db->db_blkptr) && + (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(data, db); + } else { + ASSERT(arc_released(data)); + /* XXX why do we need to thaw here? 
*/ + arc_buf_thaw(data); + } if (parent != dn->dn_dbuf) { ASSERT(parent && parent->db_data_pending); @@ -2096,17 +2134,22 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; + wp.wp_type = dn->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dncompress = dn->dn_compress; + wp.wp_oscompress = os->os_compress; + wp.wp_dnchecksum = dn->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + if (BP_IS_OLDER(db->db_blkptr, txg)) - dsl_dataset_block_kill( + (void) dsl_dataset_block_kill( os->os_dsl_dataset, db->db_blkptr, zio, tx); - dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, - dmu_get_replication_level(os, &zb, dn->dn_type), txg, - db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + dr->dr_zio = arc_write(zio, os->os_spa, &wp, + DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, + data, dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } /* ARGSUSED */ @@ -2116,27 +2159,33 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_impl_t *db = vdb; dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; + blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t fill = 0; int old_size, new_size, i; + ASSERT(db->db_blkptr == bp); + dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, zio->io_bp); + new_size = bp_get_dasize(os->os_spa, bp); - dnode_diduse_space(dn, new_size-old_size); + dnode_diduse_space(dn, new_size - old_size); - if (BP_IS_HOLE(zio->io_bp)) { + if (BP_IS_HOLE(bp)) { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, 
bp_orig, NULL, tx); - ASSERT3U(db->db_blkptr->blk_fill, ==, 0); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + ASSERT3U(bp->blk_fill, ==, 0); return; } + ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + mutex_enter(&db->db_mtx); if (db->db_level == 0) { @@ -2156,32 +2205,31 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill = 1; } } else { - blkptr_t *bp = db->db.db_data; + blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { - if (BP_IS_HOLE(bp)) + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) continue; - ASSERT3U(BP_GET_LSIZE(bp), ==, + ASSERT3U(BP_GET_LSIZE(ibp), ==, db->db_level == 1 ? dn->dn_datablksz : (1<<dn->dn_phys->dn_indblkshift)); - fill += bp->blk_fill; + fill += ibp->blk_fill; } } - db->db_blkptr->blk_fill = fill; - BP_SET_TYPE(db->db_blkptr, dn->dn_type); - BP_SET_LEVEL(db->db_blkptr, db->db_level); + bp->blk_fill = fill; mutex_exit(&db->db_mtx); - /* We must do this after we've set the bp's type and level */ - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - dsl_dataset_block_born(ds, zio->io_bp, tx); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + dsl_dataset_block_born(ds, bp, tx); } } @@ -2198,13 +2246,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_enter(&db->db_mtx); drp = &db->db_last_dirty; - while (*drp != db->db_data_pending) - drp = &(*drp)->dr_next; - ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); - ASSERT((*drp)->dr_txg == txg); - ASSERT((*drp)->dr_next == NULL); - dr = *drp; - *drp = NULL; + while ((dr = *drp) != 
db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (db->db_level == 0) { ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -2230,8 +2277,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); |