diff options
author | pjd <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000 |
---|---|---|
committer | pjd <pjd@FreeBSD.org> | 2008-11-17 20:49:29 +0000 |
commit | bbe899b96e388a8b82439f81ed3707e0d9c6070d (patch) | |
tree | 81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | |
parent | d2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff) | |
download | FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz |
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This brings a huge amount of changes; I'll enumerate only the user-visible ones:
- Delegated Administration
Allows regular users to perform ZFS operations, like file system
creation, snapshot creation, etc.
- L2ARC
Level 2 cache for ZFS - allows the use of additional disks for caching.
Huge performance improvements, mostly for random reads of mostly
static content.
- slog
Allows the use of additional disks for the ZFS Intent Log to speed up
operations like fsync(2).
- vfs.zfs.super_owner
Allows regular users to perform privileged operations on files stored
on ZFS file systems owned by them. Be very careful with this one.
- chflags(2)
Not all the flags are supported. This still needs work.
- ZFSBoot
Support to boot off of ZFS pool. Not finished, AFAIK.
Submitted by: dfr
- Snapshot properties
- New failure modes
Before, if a write request failed, the system panicked. Now one
can select one of three failure modes:
- panic - panic on write error
- wait - wait for disk to reappear
- continue - serve read requests if possible, block write requests
- Refquota, refreservation properties
Like the quota and reservation properties, but they don't count space
consumed by child file systems, clones and snapshots.
- Sparse volumes
ZVOLs that don't reserve space in the pool.
- External attributes
Compatible with extattr(2).
- NFSv4-ACLs
Not sure about the status, might not be complete yet.
Submitted by: trasz
- Creation-time properties
- Regression tests for zpool(8) command.
Obtained from: OpenSolaris
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c')
-rw-r--r-- | sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c | 405 |
1 files changed, 226 insertions, 179 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c index 94c6308..2494c1e 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include <sys/zfs_context.h> #include <sys/dmu.h> #include <sys/dmu_impl.h> @@ -39,17 +37,10 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); -static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); static arc_done_func_t dbuf_write_ready; static arc_done_func_t dbuf_write_done; -int zfs_mdcomp_disable = 0; -SYSCTL_DECL(_vfs_zfs); -TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable); -SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN, - &zfs_mdcomp_disable, 0, "Disable metadata compression"); - /* * Global data structures and functions for the dbuf cache. */ @@ -311,7 +302,7 @@ dbuf_verify(dmu_buf_impl_t *db) } if (db->db_blkid == DB_BONUS_BLKID) { ASSERT(dn != NULL); - ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID); } else { ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); @@ -460,45 +451,45 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) static void dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) { - blkptr_t *bp; + dnode_t *dn = db->db_dnode; zbookmark_t zb; uint32_t aflags = ARC_NOWAIT; + arc_buf_t *pbuf; ASSERT(!refcount_is_zero(&db->db_holds)); /* We need the struct_rwlock to prevent db_blkptr from changing. 
*/ - ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock)); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); ASSERT(MUTEX_HELD(&db->db_mtx)); ASSERT(db->db_state == DB_UNCACHED); ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size); + int bonuslen = dn->dn_bonuslen; + + ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - if (db->db.db_size < DN_MAX_BONUSLEN) + arc_space_consume(DN_MAX_BONUSLEN); + if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data, - db->db.db_size); + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, + bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); return; } - if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid)) - bp = NULL; - else - bp = db->db_blkptr; - - if (bp == NULL) - dprintf_dbuf(db, "blkptr: %s\n", "NULL"); - else - dprintf_dbuf_bp(db, bp, "%s", "blkptr:"); - - if (bp == NULL || BP_IS_HOLE(bp)) { + /* + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) || + (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr)))) { arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); - ASSERT(bp == NULL || BP_IS_HOLE(bp)); - dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa, + dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa, db->db.db_size, db, type)); bzero(db->db.db_data, db->db.db_size); db->db_state = DB_CACHED; @@ -510,6 +501,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) db->db_state = DB_READ; mutex_exit(&db->db_mtx); + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_L2CACHE; + zb.zb_objset = db->db_objset->os_dsl_dataset ? 
db->db_objset->os_dsl_dataset->ds_object : 0; zb.zb_object = db->db.db_object; @@ -518,10 +512,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ - ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES); - (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp, - db->db_level > 0 ? byteswap_uint64_array : - dmu_ot[db->db_dnode->dn_type].ot_byteswap, + + if (db->db_parent) + pbuf = db->db_parent->db_buf; + else + pbuf = db->db_objset->os_phys_buf; + + (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf, dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, (*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED, &aflags, &zb); @@ -546,7 +543,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER); prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID && - (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL; + (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL && + DBUF_IS_CACHEABLE(db); mutex_enter(&db->db_mtx); if (db->db_state == DB_CACHED) { @@ -661,6 +659,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (db->db_blkid == DB_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -690,7 +689,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) /* free this block */ if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { /* XXX can get silent EIO here */ - (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa, + (void) dsl_free(NULL, + spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -705,22 +705,50 @@ 
dbuf_unoverride(dbuf_dirty_record_t *dr) arc_release(dr->dt.dl.dr_data, db); } +/* + * Evict (if its unreferenced) or clear (if its referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. Also, if we happen accross any level-1 dbufs in the + * range that have not already been marked dirty, mark them dirty so + * they stay in memory. + */ void -dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) +dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx) { dmu_buf_impl_t *db, *db_next; uint64_t txg = tx->tx_txg; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t first_l1 = start >> epbs; + uint64_t last_l1 = end >> epbs; - dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks); + if (end > dn->dn_maxblkid) { + end = dn->dn_maxblkid; + last_l1 = end >> epbs; + } + dprintf_dnode(dn, "start=%llu end=%llu\n", start, end); mutex_enter(&dn->dn_dbufs_mtx); for (db = list_head(&dn->dn_dbufs); db; db = db_next) { db_next = list_next(&dn->dn_dbufs, db); ASSERT(db->db_blkid != DB_BONUS_BLKID); + + if (db->db_level == 1 && + db->db_blkid >= first_l1 && db->db_blkid <= last_l1) { + mutex_enter(&db->db_mtx); + if (db->db_last_dirty && + db->db_last_dirty->dr_txg < txg) { + dbuf_add_ref(db, FTAG); + mutex_exit(&db->db_mtx); + dbuf_will_dirty(db, tx); + dbuf_rele(db, FTAG); + } else { + mutex_exit(&db->db_mtx); + } + } + if (db->db_level != 0) continue; dprintf_dbuf(db, "found buf %s\n", ""); - if (db->db_blkid < blkid || - db->db_blkid >= blkid+nblks) + if (db->db_blkid < start || db->db_blkid > end) continue; /* found a level 0 buffer in the range */ @@ -783,31 +811,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx) } static int -dbuf_new_block(dmu_buf_impl_t *db) +dbuf_block_freeable(dmu_buf_impl_t *db) { dsl_dataset_t *ds = db->db_objset->os_dsl_dataset; uint64_t birth_txg = 0; - /* Don't count meta-objects */ - if (ds == NULL) - return 
(FALSE); - /* * We don't need any locking to protect db_blkptr: * If it's syncing, then db_last_dirty will be set * so we'll ignore db_blkptr. */ ASSERT(MUTEX_HELD(&db->db_mtx)); - /* If we have been dirtied since the last snapshot, its not new */ if (db->db_last_dirty) birth_txg = db->db_last_dirty->dr_txg; else if (db->db_blkptr) birth_txg = db->db_blkptr->blk_birth; + /* If we don't exist or are in a snapshot, we can't be freed */ if (birth_txg) - return (!dsl_dataset_block_freeable(ds, birth_txg)); + return (ds == NULL || + dsl_dataset_block_freeable(ds, birth_txg)); else - return (TRUE); + return (FALSE); } void @@ -865,6 +890,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) objset_impl_t *os = dn->dn_objset; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; + boolean_t do_free_accounting = B_FALSE; int txgoff = tx->tx_txg & TXG_MASK; ASSERT(tx->tx_txg != 0); @@ -922,20 +948,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drp = &db->db_last_dirty; ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg || db->db.db_object == DMU_META_DNODE_OBJECT); - while (*drp && (*drp)->dr_txg > tx->tx_txg) - drp = &(*drp)->dr_next; - if (*drp && (*drp)->dr_txg == tx->tx_txg) { + while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg) + drp = &dr->dr_next; + if (dr && dr->dr_txg == tx->tx_txg) { if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) { /* * If this buffer has already been written out, * we now need to reset its state. */ - dbuf_unoverride(*drp); + dbuf_unoverride(dr); if (db->db.db_object != DMU_META_DNODE_OBJECT) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); - return (*drp); + return (dr); } /* @@ -966,6 +992,18 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + if (db->db_blkid != DB_BONUS_BLKID) { + /* + * Update the accounting. + * Note: we delay "free accounting" until after we drop + * the db_mtx. 
This keeps us from grabbing other locks + * (and possibly deadlocking) in bp_get_dasize() while + * also holding the db_mtx. + */ + dnode_willuse_space(dn, db->db.db_size, tx); + do_free_accounting = dbuf_block_freeable(db); + } + /* * If this buffer is dirty in an old transaction group we need * to make a copy of it so that the changes we make in this @@ -1015,25 +1053,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) db->db_freed_in_flight = FALSE; } - if (db->db_blkid != DB_BONUS_BLKID) { - /* - * Update the accounting. - */ - if (!dbuf_new_block(db) && db->db_blkptr) { - /* - * This is only a guess -- if the dbuf is dirty - * in a previous txg, we don't know how much - * space it will use on disk yet. We should - * really have the struct_rwlock to access - * db_blkptr, but since this is just a guess, - * it's OK if we get an odd answer. - */ - dnode_willuse_space(dn, - -bp_get_dasize(os->os_spa, db->db_blkptr), tx); - } - dnode_willuse_space(dn, db->db.db_size, tx); - } - /* * This buffer is now part of this txg */ @@ -1050,11 +1069,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&dn->dn_mtx); dnode_setdirty(dn, tx); return (dr); - } - - if (db->db_level == 0) { - dnode_new_blkid(dn, db->db_blkid, tx); - ASSERT(dn->dn_maxblkid >= db->db_blkid); + } else if (do_free_accounting) { + blkptr_t *bp = db->db_blkptr; + int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? + bp_get_dasize(os->os_spa, bp) : db->db.db_size; + /* + * This is only a guess -- if the dbuf is dirty + * in a previous txg, we don't know how much + * space it will use on disk yet. We should + * really have the struct_rwlock to access + * db_blkptr, but since this is just a guess, + * it's OK if we get an odd answer. 
+ */ + dnode_willuse_space(dn, -willfree, tx); } if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { @@ -1062,6 +1089,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) drop_struct_lock = TRUE; } + if (db->db_level == 0) { + dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + if (db->db_level+1 < dn->dn_nlevels) { dmu_buf_impl_t *parent = db->db_parent; dbuf_dirty_record_t *di; @@ -1115,7 +1147,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; uint64_t txg = tx->tx_txg; - dbuf_dirty_record_t *dr; + dbuf_dirty_record_t *dr, **drp; ASSERT(txg != 0); ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -1125,7 +1157,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * If this buffer is not dirty, we're done. */ - for (dr = db->db_last_dirty; dr; dr = dr->dr_next) + for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next) if (dr->dr_txg <= txg) break; if (dr == NULL || dr->dr_txg < txg) { @@ -1155,14 +1187,14 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* XXX would be nice to fix up dn_towrite_space[] */ - db->db_last_dirty = dr->dr_next; + *drp = dr->dr_next; if (dr->dr_parent) { mutex_enter(&dr->dr_parent->dt.di.dr_mtx); list_remove(&dr->dr_parent->dt.di.dr_children, dr); mutex_exit(&dr->dr_parent->dt.di.dr_mtx); } else if (db->db_level+1 == dn->dn_nlevels) { - ASSERT3P(db->db_parent, ==, dn->dn_dbuf); + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); mutex_enter(&dn->dn_mtx); list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); mutex_exit(&dn->dn_mtx); @@ -1178,8 +1210,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else { ASSERT(db->db_buf != NULL); ASSERT(list_head(&dr->dt.di.dr_children) == NULL); - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); @@ -1204,7 +1236,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) void 
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { - int rf = DB_RF_MUST_SUCCEED; + int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH; ASSERT(tx->tx_txg != 0); ASSERT(!refcount_is_zero(&db->db_holds)); @@ -1282,8 +1314,10 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_state == DB_CACHED) { ASSERT(db->db.db_data != NULL); - if (db->db_blkid == DB_BONUS_BLKID) + if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db.db_data = NULL; db->db_state = DB_UNCACHED; } @@ -1297,6 +1331,7 @@ dbuf_clear(dmu_buf_impl_t *db) if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) { list_remove(&dn->dn_dbufs, db); dnode_rele(dn, db); + db->db_dnode = NULL; } if (db->db_buf) @@ -1397,10 +1432,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, if (blkid == DB_BONUS_BLKID) { ASSERT3P(parent, ==, dn->dn_dbuf); - db->db.db_size = dn->dn_bonuslen; + db->db.db_size = DN_MAX_BONUSLEN - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t)); return (db); } else { int blocksize = @@ -1427,6 +1465,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t)); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1469,31 +1508,33 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(refcount_is_zero(&db->db_holds)); if (db->db_blkid != DB_BONUS_BLKID) { - dnode_t *dn = db->db_dnode; - /* * If this dbuf is still on the dn_dbufs list, * remove it from that list. 
*/ - if (list_link_active(&db->db_link)) { + if (db->db_dnode) { + dnode_t *dn = db->db_dnode; + mutex_enter(&dn->dn_dbufs_mtx); list_remove(&dn->dn_dbufs, db); mutex_exit(&dn->dn_dbufs_mtx); dnode_rele(dn, db); + db->db_dnode = NULL; } dbuf_hash_remove(db); } db->db_parent = NULL; - db->db_dnode = NULL; db->db_buf = NULL; + ASSERT(!list_link_active(&db->db_link)); ASSERT(db->db.db_data == NULL); ASSERT(db->db_hash_next == NULL); ASSERT(db->db_blkptr == NULL); ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t)); } void @@ -1525,6 +1566,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { + arc_buf_t *pbuf; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; zb.zb_objset = dn->dn_objset->os_dsl_dataset ? @@ -1533,9 +1575,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) zb.zb_level = 0; zb.zb_blkid = blkid; - (void) arc_read(NULL, dn->dn_objset->os_spa, bp, - dmu_ot[dn->dn_type].ot_byteswap, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + if (db) + pbuf = db->db_buf; + else + pbuf = dn->dn_objset->os_phys_buf; + + (void) arc_read(NULL, dn->dn_objset->os_spa, + bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); } @@ -1652,16 +1698,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) return (err ? 
NULL : db); } -dmu_buf_impl_t * +void dbuf_create_bonus(dnode_t *dn) { - dmu_buf_impl_t *db = dn->dn_bonus; - ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); ASSERT(dn->dn_bonus == NULL); - db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); - return (db); + dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL); } #pragma weak dmu_buf_add_ref = dbuf_add_ref @@ -1716,7 +1759,10 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag) dbuf_evict(db); } else { VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0); - mutex_exit(&db->db_mtx); + if (!DBUF_IS_CACHEABLE(db)) + dbuf_clear(db); + else + mutex_exit(&db->db_mtx); } } else { mutex_exit(&db->db_mtx); @@ -1852,15 +1898,8 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) db->db_data_pending = dr; - arc_release(db->db_buf, db); mutex_exit(&db->db_mtx); - - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4, - zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx); + dbuf_write(dr, db->db_buf, tx); zio = dr->dr_zio; mutex_enter(&dr->dt.di.dr_mtx); @@ -1878,7 +1917,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - int checksum, compress; int blksz; ASSERT(dmu_tx_is_syncing(tx)); @@ -1909,23 +1947,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) */ if (db->db_blkid == DB_BONUS_BLKID) { dbuf_dirty_record_t **drp; - /* - * Use dn_phys->dn_bonuslen since db.db_size is the length - * of the bonus buffer in the open transaction rather than - * the syncing transaction. 
- */ + ASSERT(*datap != NULL); ASSERT3U(db->db_level, ==, 0); ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN); bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); - if (*datap != db->db.db_data) + if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN); + } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; - ASSERT((*drp)->dr_next == NULL); - *drp = NULL; + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (dr->dr_dbuf->db_level != 0) { list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); @@ -1939,6 +1975,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) } /* + * This function may have dropped the db_mtx lock allowing a dmu_sync + * operation to sneak in. As a result, we need to ensure that we + * don't check the dr_override_state until we have returned from + * dbuf_check_blkptr. + */ + dbuf_check_blkptr(dn, db); + + /* * If this buffer is in the middle of an immdiate write, * wait for the synchronous IO to complete. 
*/ @@ -1948,8 +1992,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - dbuf_check_blkptr(dn, db); - /* * If this dbuf has already been written out via an immediate write, * just complete the write by copying over the new block pointer and @@ -1963,6 +2005,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) zio_fake.io_bp = db->db_blkptr; zio_fake.io_bp_orig = *db->db_blkptr; zio_fake.io_txg = txg; + zio_fake.io_flags = 0; *db->db_blkptr = dr->dt.dl.dr_overridden_by; dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; @@ -1970,8 +2013,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) dr->dr_zio = &zio_fake; mutex_exit(&db->db_mtx); + ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp), + BP_IDENTITY(&zio_fake.io_bp_orig)) || + BP_IS_HOLE(zio_fake.io_bp)); + if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - dsl_dataset_block_kill(os->os_dsl_dataset, + (void) dsl_dataset_block_kill(os->os_dsl_dataset, &zio_fake.io_bp_orig, dn->dn_zio, tx); dbuf_write_ready(&zio_fake, db->db_buf, db); @@ -1997,14 +2044,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) *datap = arc_buf_alloc(os->os_spa, blksz, db, type); bcopy(db->db.db_data, (*datap)->b_data, blksz); } - } else { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(db->db_buf, db); } ASSERT(*datap != NULL); @@ -2012,22 +2051,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) mutex_exit(&db->db_mtx); - /* - * Allow dnode settings to override objset settings, - * except for metadata checksums. 
- */ - if (dmu_ot[dn->dn_type].ot_metadata) { - checksum = os->os_md_checksum; - compress = zio_compress_select(dn->dn_compress, - os->os_md_compress); - } else { - checksum = zio_checksum_select(dn->dn_checksum, - os->os_checksum); - compress = zio_compress_select(dn->dn_compress, - os->os_compress); - } - - dbuf_write(dr, *datap, checksum, compress, tx); + dbuf_write(dr, *datap, tx); ASSERT(!list_link_active(&dr->dr_dirty_node)); if (dn->dn_object == DMU_META_DNODE_OBJECT) @@ -2063,8 +2087,7 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) } static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, - int compress, dmu_tx_t *tx) +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) { dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = db->db_dnode; @@ -2072,8 +2095,23 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, dmu_buf_impl_t *parent = db->db_parent; uint64_t txg = tx->tx_txg; zbookmark_t zb; + writeprops_t wp = { 0 }; zio_t *zio; - int zio_flags; + + if (!BP_IS_HOLE(db->db_blkptr) && + (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + arc_release(data, db); + } else { + ASSERT(arc_released(data)); + /* XXX why do we need to thaw here? 
*/ + arc_buf_thaw(data); + } if (parent != dn->dn_dbuf) { ASSERT(parent && parent->db_data_pending); @@ -2096,17 +2134,22 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum, zb.zb_level = db->db_level; zb.zb_blkid = db->db_blkid; - zio_flags = ZIO_FLAG_MUSTSUCCEED; - if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0) - zio_flags |= ZIO_FLAG_METADATA; + wp.wp_type = dn->dn_type; + wp.wp_level = db->db_level; + wp.wp_copies = os->os_copies; + wp.wp_dncompress = dn->dn_compress; + wp.wp_oscompress = os->os_compress; + wp.wp_dnchecksum = dn->dn_checksum; + wp.wp_oschecksum = os->os_checksum; + if (BP_IS_OLDER(db->db_blkptr, txg)) - dsl_dataset_block_kill( + (void) dsl_dataset_block_kill( os->os_dsl_dataset, db->db_blkptr, zio, tx); - dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress, - dmu_get_replication_level(os, &zb, dn->dn_type), txg, - db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb); + dr->dr_zio = arc_write(zio, os->os_spa, &wp, + DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, + data, dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); } /* ARGSUSED */ @@ -2116,27 +2159,33 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) dmu_buf_impl_t *db = vdb; dnode_t *dn = db->db_dnode; objset_impl_t *os = dn->dn_objset; + blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; uint64_t fill = 0; int old_size, new_size, i; + ASSERT(db->db_blkptr == bp); + dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, zio->io_bp); + new_size = bp_get_dasize(os->os_spa, bp); - dnode_diduse_space(dn, new_size-old_size); + dnode_diduse_space(dn, new_size - old_size); - if (BP_IS_HOLE(zio->io_bp)) { + if (BP_IS_HOLE(bp)) { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, 
bp_orig, NULL, tx); - ASSERT3U(db->db_blkptr->blk_fill, ==, 0); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + ASSERT3U(bp->blk_fill, ==, 0); return; } + ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT(BP_GET_LEVEL(bp) == db->db_level); + mutex_enter(&db->db_mtx); if (db->db_level == 0) { @@ -2156,32 +2205,31 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) fill = 1; } } else { - blkptr_t *bp = db->db.db_data; + blkptr_t *ibp = db->db.db_data; ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift); - for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) { - if (BP_IS_HOLE(bp)) + for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { + if (BP_IS_HOLE(ibp)) continue; - ASSERT3U(BP_GET_LSIZE(bp), ==, + ASSERT3U(BP_GET_LSIZE(ibp), ==, db->db_level == 1 ? dn->dn_datablksz : (1<<dn->dn_phys->dn_indblkshift)); - fill += bp->blk_fill; + fill += ibp->blk_fill; } } - db->db_blkptr->blk_fill = fill; - BP_SET_TYPE(db->db_blkptr, dn->dn_type); - BP_SET_LEVEL(db->db_blkptr, db->db_level); + bp->blk_fill = fill; mutex_exit(&db->db_mtx); - /* We must do this after we've set the bp's type and level */ - if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) { + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + } else { dsl_dataset_t *ds = os->os_dsl_dataset; dmu_tx_t *tx = os->os_synctx; if (bp_orig->blk_birth == tx->tx_txg) - dsl_dataset_block_kill(ds, bp_orig, NULL, tx); - dsl_dataset_block_born(ds, zio->io_bp, tx); + (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); + dsl_dataset_block_born(ds, bp, tx); } } @@ -2198,13 +2246,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) mutex_enter(&db->db_mtx); drp = &db->db_last_dirty; - while (*drp != db->db_data_pending) - drp = &(*drp)->dr_next; - ASSERT(!list_link_active(&(*drp)->dr_dirty_node)); - ASSERT((*drp)->dr_txg == txg); - ASSERT((*drp)->dr_next == NULL); - dr = *drp; - *drp = NULL; + while ((dr = *drp) != 
db->db_data_pending) + drp = &dr->dr_next; + ASSERT(!list_link_active(&dr->dr_dirty_node)); + ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_next == NULL); + *drp = dr->dr_next; if (db->db_level == 0) { ASSERT(db->db_blkid != DB_BONUS_BLKID); @@ -2230,8 +2277,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) >> (db->db_level * epbs), >=, db->db_blkid); arc_set_callback(db->db_buf, dbuf_do_evict, db); } - list_destroy(&dr->dt.di.dr_children); mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); |