summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
diff options
context:
space:
mode:
authorpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
committerpjd <pjd@FreeBSD.org>2008-11-17 20:49:29 +0000
commitbbe899b96e388a8b82439f81ed3707e0d9c6070d (patch)
tree81b89fa4ac6467771d5aa291a97f4665981a6108 /sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
parentd2f579595c362ce27b4d87e2c40e1c4e09b929e3 (diff)
downloadFreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.zip
FreeBSD-src-bbe899b96e388a8b82439f81ed3707e0d9c6070d.tar.gz
Update ZFS from version 6 to 13 and bring some FreeBSD-specific changes.
This bring huge amount of changes, I'll enumerate only user-visible changes: - Delegated Administration Allows regular users to perform ZFS operations, like file system creation, snapshot creation, etc. - L2ARC Level 2 cache for ZFS - allows to use additional disks for cache. Huge performance improvements mostly for random read of mostly static content. - slog Allow to use additional disks for ZFS Intent Log to speed up operations like fsync(2). - vfs.zfs.super_owner Allows regular users to perform privileged operations on files stored on ZFS file systems owned by him. Very careful with this one. - chflags(2) Not all the flags are supported. This still needs work. - ZFSBoot Support to boot off of ZFS pool. Not finished, AFAIK. Submitted by: dfr - Snapshot properties - New failure modes Before if write requested failed, system paniced. Now one can select from one of three failure modes: - panic - panic on write error - wait - wait for disk to reappear - continue - serve read requests if possible, block write requests - Refquota, refreservation properties Just quota and reservation properties, but don't count space consumed by children file systems, clones and snapshots. - Sparse volumes ZVOLs that don't reserve space in the pool. - External attributes Compatible with extattr(2). - NFSv4-ACLs Not sure about the status, might not be complete yet. Submitted by: trasz - Creation-time properties - Regression tests for zpool(8) command. Obtained from: OpenSolaris
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c405
1 files changed, 226 insertions, 179 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index 94c6308..2494c1e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/zfs_context.h>
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
@@ -39,17 +37,10 @@
static void dbuf_destroy(dmu_buf_impl_t *db);
static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx);
-static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx);
+static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx);
static arc_done_func_t dbuf_write_ready;
static arc_done_func_t dbuf_write_done;
-int zfs_mdcomp_disable = 0;
-SYSCTL_DECL(_vfs_zfs);
-TUNABLE_INT("vfs.zfs.mdcomp_disable", &zfs_mdcomp_disable);
-SYSCTL_INT(_vfs_zfs, OID_AUTO, mdcomp_disable, CTLFLAG_RDTUN,
- &zfs_mdcomp_disable, 0, "Disable metadata compression");
-
/*
* Global data structures and functions for the dbuf cache.
*/
@@ -311,7 +302,7 @@ dbuf_verify(dmu_buf_impl_t *db)
}
if (db->db_blkid == DB_BONUS_BLKID) {
ASSERT(dn != NULL);
- ASSERT3U(db->db.db_size, ==, dn->dn_bonuslen);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
ASSERT3U(db->db.db_offset, ==, DB_BONUS_BLKID);
} else {
ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size);
@@ -460,45 +451,45 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb)
static void
dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
{
- blkptr_t *bp;
+ dnode_t *dn = db->db_dnode;
zbookmark_t zb;
uint32_t aflags = ARC_NOWAIT;
+ arc_buf_t *pbuf;
ASSERT(!refcount_is_zero(&db->db_holds));
/* We need the struct_rwlock to prevent db_blkptr from changing. */
- ASSERT(RW_LOCK_HELD(&db->db_dnode->dn_struct_rwlock));
+ ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock));
ASSERT(MUTEX_HELD(&db->db_mtx));
ASSERT(db->db_state == DB_UNCACHED);
ASSERT(db->db_buf == NULL);
if (db->db_blkid == DB_BONUS_BLKID) {
- ASSERT3U(db->db_dnode->dn_bonuslen, ==, db->db.db_size);
+ int bonuslen = dn->dn_bonuslen;
+
+ ASSERT3U(bonuslen, <=, db->db.db_size);
db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN);
- if (db->db.db_size < DN_MAX_BONUSLEN)
+ arc_space_consume(DN_MAX_BONUSLEN);
+ if (bonuslen < DN_MAX_BONUSLEN)
bzero(db->db.db_data, DN_MAX_BONUSLEN);
- bcopy(DN_BONUS(db->db_dnode->dn_phys), db->db.db_data,
- db->db.db_size);
+ bcopy(DN_BONUS(dn->dn_phys), db->db.db_data,
+ bonuslen);
dbuf_update_data(db);
db->db_state = DB_CACHED;
mutex_exit(&db->db_mtx);
return;
}
- if (db->db_level == 0 && dnode_block_freed(db->db_dnode, db->db_blkid))
- bp = NULL;
- else
- bp = db->db_blkptr;
-
- if (bp == NULL)
- dprintf_dbuf(db, "blkptr: %s\n", "NULL");
- else
- dprintf_dbuf_bp(db, bp, "%s", "blkptr:");
-
- if (bp == NULL || BP_IS_HOLE(bp)) {
+ /*
+ * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync()
+ * processes the delete record and clears the bp while we are waiting
+ * for the dn_mtx (resulting in a "no" from block_freed).
+ */
+ if (db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr) ||
+ (db->db_level == 0 && (dnode_block_freed(dn, db->db_blkid) ||
+ BP_IS_HOLE(db->db_blkptr)))) {
arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db);
- ASSERT(bp == NULL || BP_IS_HOLE(bp));
- dbuf_set_data(db, arc_buf_alloc(db->db_dnode->dn_objset->os_spa,
+ dbuf_set_data(db, arc_buf_alloc(dn->dn_objset->os_spa,
db->db.db_size, db, type));
bzero(db->db.db_data, db->db.db_size);
db->db_state = DB_CACHED;
@@ -510,6 +501,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
db->db_state = DB_READ;
mutex_exit(&db->db_mtx);
+ if (DBUF_IS_L2CACHEABLE(db))
+ aflags |= ARC_L2CACHE;
+
zb.zb_objset = db->db_objset->os_dsl_dataset ?
db->db_objset->os_dsl_dataset->ds_object : 0;
zb.zb_object = db->db.db_object;
@@ -518,10 +512,13 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags)
dbuf_add_ref(db, NULL);
/* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */
- ASSERT3U(db->db_dnode->dn_type, <, DMU_OT_NUMTYPES);
- (void) arc_read(zio, db->db_dnode->dn_objset->os_spa, bp,
- db->db_level > 0 ? byteswap_uint64_array :
- dmu_ot[db->db_dnode->dn_type].ot_byteswap,
+
+ if (db->db_parent)
+ pbuf = db->db_parent->db_buf;
+ else
+ pbuf = db->db_objset->os_phys_buf;
+
+ (void) arc_read(zio, dn->dn_objset->os_spa, db->db_blkptr, pbuf,
dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ,
(*flags & DB_RF_CANFAIL) ? ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED,
&aflags, &zb);
@@ -546,7 +543,8 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
rw_enter(&db->db_dnode->dn_struct_rwlock, RW_READER);
prefetch = db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID &&
- (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL;
+ (flags & DB_RF_NOPREFETCH) == 0 && db->db_dnode != NULL &&
+ DBUF_IS_CACHEABLE(db);
mutex_enter(&db->db_mtx);
if (db->db_state == DB_CACHED) {
@@ -661,6 +659,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg)
if (db->db_blkid == DB_BONUS_BLKID) {
/* Note that the data bufs here are zio_bufs */
dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN);
+ arc_space_consume(DN_MAX_BONUSLEN);
bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN);
} else if (refcount_count(&db->db_holds) > db->db_dirtycnt) {
int size = db->db.db_size;
@@ -690,7 +689,8 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
/* free this block */
if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) {
/* XXX can get silent EIO here */
- (void) arc_free(NULL, db->db_dnode->dn_objset->os_spa,
+ (void) dsl_free(NULL,
+ spa_get_dsl(db->db_dnode->dn_objset->os_spa),
txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT);
}
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -705,22 +705,50 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
arc_release(dr->dt.dl.dr_data, db);
}
+/*
+ * Evict (if its unreferenced) or clear (if its referenced) any level-0
+ * data blocks in the free range, so that any future readers will find
+ * empty blocks. Also, if we happen accross any level-1 dbufs in the
+ * range that have not already been marked dirty, mark them dirty so
+ * they stay in memory.
+ */
void
-dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
+dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
uint64_t txg = tx->tx_txg;
+ int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT;
+ uint64_t first_l1 = start >> epbs;
+ uint64_t last_l1 = end >> epbs;
- dprintf_dnode(dn, "blkid=%llu nblks=%llu\n", blkid, nblks);
+ if (end > dn->dn_maxblkid) {
+ end = dn->dn_maxblkid;
+ last_l1 = end >> epbs;
+ }
+ dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
+
+ if (db->db_level == 1 &&
+ db->db_blkid >= first_l1 && db->db_blkid <= last_l1) {
+ mutex_enter(&db->db_mtx);
+ if (db->db_last_dirty &&
+ db->db_last_dirty->dr_txg < txg) {
+ dbuf_add_ref(db, FTAG);
+ mutex_exit(&db->db_mtx);
+ dbuf_will_dirty(db, tx);
+ dbuf_rele(db, FTAG);
+ } else {
+ mutex_exit(&db->db_mtx);
+ }
+ }
+
if (db->db_level != 0)
continue;
dprintf_dbuf(db, "found buf %s\n", "");
- if (db->db_blkid < blkid ||
- db->db_blkid >= blkid+nblks)
+ if (db->db_blkid < start || db->db_blkid > end)
continue;
/* found a level 0 buffer in the range */
@@ -783,31 +811,28 @@ dbuf_free_range(dnode_t *dn, uint64_t blkid, uint64_t nblks, dmu_tx_t *tx)
}
static int
-dbuf_new_block(dmu_buf_impl_t *db)
+dbuf_block_freeable(dmu_buf_impl_t *db)
{
dsl_dataset_t *ds = db->db_objset->os_dsl_dataset;
uint64_t birth_txg = 0;
- /* Don't count meta-objects */
- if (ds == NULL)
- return (FALSE);
-
/*
* We don't need any locking to protect db_blkptr:
* If it's syncing, then db_last_dirty will be set
* so we'll ignore db_blkptr.
*/
ASSERT(MUTEX_HELD(&db->db_mtx));
- /* If we have been dirtied since the last snapshot, its not new */
if (db->db_last_dirty)
birth_txg = db->db_last_dirty->dr_txg;
else if (db->db_blkptr)
birth_txg = db->db_blkptr->blk_birth;
+ /* If we don't exist or are in a snapshot, we can't be freed */
if (birth_txg)
- return (!dsl_dataset_block_freeable(ds, birth_txg));
+ return (ds == NULL ||
+ dsl_dataset_block_freeable(ds, birth_txg));
else
- return (TRUE);
+ return (FALSE);
}
void
@@ -865,6 +890,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
objset_impl_t *os = dn->dn_objset;
dbuf_dirty_record_t **drp, *dr;
int drop_struct_lock = FALSE;
+ boolean_t do_free_accounting = B_FALSE;
int txgoff = tx->tx_txg & TXG_MASK;
ASSERT(tx->tx_txg != 0);
@@ -922,20 +948,20 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
drp = &db->db_last_dirty;
ASSERT(*drp == NULL || (*drp)->dr_txg <= tx->tx_txg ||
db->db.db_object == DMU_META_DNODE_OBJECT);
- while (*drp && (*drp)->dr_txg > tx->tx_txg)
- drp = &(*drp)->dr_next;
- if (*drp && (*drp)->dr_txg == tx->tx_txg) {
+ while ((dr = *drp) != NULL && dr->dr_txg > tx->tx_txg)
+ drp = &dr->dr_next;
+ if (dr && dr->dr_txg == tx->tx_txg) {
if (db->db_level == 0 && db->db_blkid != DB_BONUS_BLKID) {
/*
* If this buffer has already been written out,
* we now need to reset its state.
*/
- dbuf_unoverride(*drp);
+ dbuf_unoverride(dr);
if (db->db.db_object != DMU_META_DNODE_OBJECT)
arc_buf_thaw(db->db_buf);
}
mutex_exit(&db->db_mtx);
- return (*drp);
+ return (dr);
}
/*
@@ -966,6 +992,18 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size);
+ if (db->db_blkid != DB_BONUS_BLKID) {
+ /*
+ * Update the accounting.
+ * Note: we delay "free accounting" until after we drop
+ * the db_mtx. This keeps us from grabbing other locks
+ * (and possibly deadlocking) in bp_get_dasize() while
+ * also holding the db_mtx.
+ */
+ dnode_willuse_space(dn, db->db.db_size, tx);
+ do_free_accounting = dbuf_block_freeable(db);
+ }
+
/*
* If this buffer is dirty in an old transaction group we need
* to make a copy of it so that the changes we make in this
@@ -1015,25 +1053,6 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
db->db_freed_in_flight = FALSE;
}
- if (db->db_blkid != DB_BONUS_BLKID) {
- /*
- * Update the accounting.
- */
- if (!dbuf_new_block(db) && db->db_blkptr) {
- /*
- * This is only a guess -- if the dbuf is dirty
- * in a previous txg, we don't know how much
- * space it will use on disk yet. We should
- * really have the struct_rwlock to access
- * db_blkptr, but since this is just a guess,
- * it's OK if we get an odd answer.
- */
- dnode_willuse_space(dn,
- -bp_get_dasize(os->os_spa, db->db_blkptr), tx);
- }
- dnode_willuse_space(dn, db->db.db_size, tx);
- }
-
/*
* This buffer is now part of this txg
*/
@@ -1050,11 +1069,19 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
mutex_exit(&dn->dn_mtx);
dnode_setdirty(dn, tx);
return (dr);
- }
-
- if (db->db_level == 0) {
- dnode_new_blkid(dn, db->db_blkid, tx);
- ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ } else if (do_free_accounting) {
+ blkptr_t *bp = db->db_blkptr;
+ int64_t willfree = (bp && !BP_IS_HOLE(bp)) ?
+ bp_get_dasize(os->os_spa, bp) : db->db.db_size;
+ /*
+ * This is only a guess -- if the dbuf is dirty
+ * in a previous txg, we don't know how much
+ * space it will use on disk yet. We should
+ * really have the struct_rwlock to access
+ * db_blkptr, but since this is just a guess,
+ * it's OK if we get an odd answer.
+ */
+ dnode_willuse_space(dn, -willfree, tx);
}
if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
@@ -1062,6 +1089,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
drop_struct_lock = TRUE;
}
+ if (db->db_level == 0) {
+ dnode_new_blkid(dn, db->db_blkid, tx, drop_struct_lock);
+ ASSERT(dn->dn_maxblkid >= db->db_blkid);
+ }
+
if (db->db_level+1 < dn->dn_nlevels) {
dmu_buf_impl_t *parent = db->db_parent;
dbuf_dirty_record_t *di;
@@ -1115,7 +1147,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
dnode_t *dn = db->db_dnode;
uint64_t txg = tx->tx_txg;
- dbuf_dirty_record_t *dr;
+ dbuf_dirty_record_t *dr, **drp;
ASSERT(txg != 0);
ASSERT(db->db_blkid != DB_BONUS_BLKID);
@@ -1125,7 +1157,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/*
* If this buffer is not dirty, we're done.
*/
- for (dr = db->db_last_dirty; dr; dr = dr->dr_next)
+ for (drp = &db->db_last_dirty; (dr = *drp) != NULL; drp = &dr->dr_next)
if (dr->dr_txg <= txg)
break;
if (dr == NULL || dr->dr_txg < txg) {
@@ -1155,14 +1187,14 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
/* XXX would be nice to fix up dn_towrite_space[] */
- db->db_last_dirty = dr->dr_next;
+ *drp = dr->dr_next;
if (dr->dr_parent) {
mutex_enter(&dr->dr_parent->dt.di.dr_mtx);
list_remove(&dr->dr_parent->dt.di.dr_children, dr);
mutex_exit(&dr->dr_parent->dt.di.dr_mtx);
} else if (db->db_level+1 == dn->dn_nlevels) {
- ASSERT3P(db->db_parent, ==, dn->dn_dbuf);
+ ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf);
mutex_enter(&dn->dn_mtx);
list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr);
mutex_exit(&dn->dn_mtx);
@@ -1178,8 +1210,8 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
} else {
ASSERT(db->db_buf != NULL);
ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
- list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
@@ -1204,7 +1236,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
void
dbuf_will_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
{
- int rf = DB_RF_MUST_SUCCEED;
+ int rf = DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH;
ASSERT(tx->tx_txg != 0);
ASSERT(!refcount_is_zero(&db->db_holds));
@@ -1282,8 +1314,10 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_state == DB_CACHED) {
ASSERT(db->db.db_data != NULL);
- if (db->db_blkid == DB_BONUS_BLKID)
+ if (db->db_blkid == DB_BONUS_BLKID) {
zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN);
+ }
db->db.db_data = NULL;
db->db_state = DB_UNCACHED;
}
@@ -1297,6 +1331,7 @@ dbuf_clear(dmu_buf_impl_t *db)
if (db->db_blkid != DB_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
dnode_rele(dn, db);
+ db->db_dnode = NULL;
}
if (db->db_buf)
@@ -1397,10 +1432,13 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
if (blkid == DB_BONUS_BLKID) {
ASSERT3P(parent, ==, dn->dn_dbuf);
- db->db.db_size = dn->dn_bonuslen;
+ db->db.db_size = DN_MAX_BONUSLEN -
+ (dn->dn_nblkptr-1) * sizeof (blkptr_t);
+ ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen);
db->db.db_offset = DB_BONUS_BLKID;
db->db_state = DB_UNCACHED;
/* the bonus dbuf is not placed in the hash table */
+ arc_space_consume(sizeof (dmu_buf_impl_t));
return (db);
} else {
int blocksize =
@@ -1427,6 +1465,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
list_insert_head(&dn->dn_dbufs, db);
db->db_state = DB_UNCACHED;
mutex_exit(&dn->dn_dbufs_mtx);
+ arc_space_consume(sizeof (dmu_buf_impl_t));
if (parent && parent != dn->dn_dbuf)
dbuf_add_ref(parent, db);
@@ -1469,31 +1508,33 @@ dbuf_destroy(dmu_buf_impl_t *db)
ASSERT(refcount_is_zero(&db->db_holds));
if (db->db_blkid != DB_BONUS_BLKID) {
- dnode_t *dn = db->db_dnode;
-
/*
* If this dbuf is still on the dn_dbufs list,
* remove it from that list.
*/
- if (list_link_active(&db->db_link)) {
+ if (db->db_dnode) {
+ dnode_t *dn = db->db_dnode;
+
mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
mutex_exit(&dn->dn_dbufs_mtx);
dnode_rele(dn, db);
+ db->db_dnode = NULL;
}
dbuf_hash_remove(db);
}
db->db_parent = NULL;
- db->db_dnode = NULL;
db->db_buf = NULL;
+ ASSERT(!list_link_active(&db->db_link));
ASSERT(db->db.db_data == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
ASSERT(db->db_data_pending == NULL);
kmem_cache_free(dbuf_cache, db);
+ arc_space_return(sizeof (dmu_buf_impl_t));
}
void
@@ -1525,6 +1566,7 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) {
if (bp && !BP_IS_HOLE(bp)) {
+ arc_buf_t *pbuf;
uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH;
zbookmark_t zb;
zb.zb_objset = dn->dn_objset->os_dsl_dataset ?
@@ -1533,9 +1575,13 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid)
zb.zb_level = 0;
zb.zb_blkid = blkid;
- (void) arc_read(NULL, dn->dn_objset->os_spa, bp,
- dmu_ot[dn->dn_type].ot_byteswap,
- NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
+ if (db)
+ pbuf = db->db_buf;
+ else
+ pbuf = dn->dn_objset->os_phys_buf;
+
+ (void) arc_read(NULL, dn->dn_objset->os_spa,
+ bp, pbuf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ,
ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE,
&aflags, &zb);
}
@@ -1652,16 +1698,13 @@ dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag)
return (err ? NULL : db);
}
-dmu_buf_impl_t *
+void
dbuf_create_bonus(dnode_t *dn)
{
- dmu_buf_impl_t *db = dn->dn_bonus;
-
ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
ASSERT(dn->dn_bonus == NULL);
- db = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
- return (db);
+ dn->dn_bonus = dbuf_create(dn, 0, DB_BONUS_BLKID, dn->dn_dbuf, NULL);
}
#pragma weak dmu_buf_add_ref = dbuf_add_ref
@@ -1716,7 +1759,10 @@ dbuf_rele(dmu_buf_impl_t *db, void *tag)
dbuf_evict(db);
} else {
VERIFY(arc_buf_remove_ref(db->db_buf, db) == 0);
- mutex_exit(&db->db_mtx);
+ if (!DBUF_IS_CACHEABLE(db))
+ dbuf_clear(db);
+ else
+ mutex_exit(&db->db_mtx);
}
} else {
mutex_exit(&db->db_mtx);
@@ -1852,15 +1898,8 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
db->db_data_pending = dr;
- arc_release(db->db_buf, db);
mutex_exit(&db->db_mtx);
-
- /*
- * XXX -- we should design a compression algorithm
- * that specializes in arrays of bps.
- */
- dbuf_write(dr, db->db_buf, ZIO_CHECKSUM_FLETCHER_4,
- zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : ZIO_COMPRESS_LZJB, tx);
+ dbuf_write(dr, db->db_buf, tx);
zio = dr->dr_zio;
mutex_enter(&dr->dt.di.dr_mtx);
@@ -1878,7 +1917,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
uint64_t txg = tx->tx_txg;
- int checksum, compress;
int blksz;
ASSERT(dmu_tx_is_syncing(tx));
@@ -1909,23 +1947,21 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*/
if (db->db_blkid == DB_BONUS_BLKID) {
dbuf_dirty_record_t **drp;
- /*
- * Use dn_phys->dn_bonuslen since db.db_size is the length
- * of the bonus buffer in the open transaction rather than
- * the syncing transaction.
- */
+
ASSERT(*datap != NULL);
ASSERT3U(db->db_level, ==, 0);
ASSERT3U(dn->dn_phys->dn_bonuslen, <=, DN_MAX_BONUSLEN);
bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen);
- if (*datap != db->db.db_data)
+ if (*datap != db->db.db_data) {
zio_buf_free(*datap, DN_MAX_BONUSLEN);
+ arc_space_return(DN_MAX_BONUSLEN);
+ }
db->db_data_pending = NULL;
drp = &db->db_last_dirty;
while (*drp != dr)
drp = &(*drp)->dr_next;
- ASSERT((*drp)->dr_next == NULL);
- *drp = NULL;
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
if (dr->dr_dbuf->db_level != 0) {
list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
@@ -1939,6 +1975,14 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
}
/*
+ * This function may have dropped the db_mtx lock allowing a dmu_sync
+ * operation to sneak in. As a result, we need to ensure that we
+ * don't check the dr_override_state until we have returned from
+ * dbuf_check_blkptr.
+ */
+ dbuf_check_blkptr(dn, db);
+
+ /*
* If this buffer is in the middle of an immdiate write,
* wait for the synchronous IO to complete.
*/
@@ -1948,8 +1992,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
}
- dbuf_check_blkptr(dn, db);
-
/*
* If this dbuf has already been written out via an immediate write,
* just complete the write by copying over the new block pointer and
@@ -1963,6 +2005,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
zio_fake.io_bp = db->db_blkptr;
zio_fake.io_bp_orig = *db->db_blkptr;
zio_fake.io_txg = txg;
+ zio_fake.io_flags = 0;
*db->db_blkptr = dr->dt.dl.dr_overridden_by;
dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN;
@@ -1970,8 +2013,12 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
dr->dr_zio = &zio_fake;
mutex_exit(&db->db_mtx);
+ ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp),
+ BP_IDENTITY(&zio_fake.io_bp_orig)) ||
+ BP_IS_HOLE(zio_fake.io_bp));
+
if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg))
- dsl_dataset_block_kill(os->os_dsl_dataset,
+ (void) dsl_dataset_block_kill(os->os_dsl_dataset,
&zio_fake.io_bp_orig, dn->dn_zio, tx);
dbuf_write_ready(&zio_fake, db->db_buf, db);
@@ -1997,14 +2044,6 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
*datap = arc_buf_alloc(os->os_spa, blksz, db, type);
bcopy(db->db.db_data, (*datap)->b_data, blksz);
}
- } else {
- /*
- * Private object buffers are released here rather
- * than in dbuf_dirty() since they are only modified
- * in the syncing context and we don't want the
- * overhead of making multiple copies of the data.
- */
- arc_release(db->db_buf, db);
}
ASSERT(*datap != NULL);
@@ -2012,22 +2051,7 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
mutex_exit(&db->db_mtx);
- /*
- * Allow dnode settings to override objset settings,
- * except for metadata checksums.
- */
- if (dmu_ot[dn->dn_type].ot_metadata) {
- checksum = os->os_md_checksum;
- compress = zio_compress_select(dn->dn_compress,
- os->os_md_compress);
- } else {
- checksum = zio_checksum_select(dn->dn_checksum,
- os->os_checksum);
- compress = zio_compress_select(dn->dn_compress,
- os->os_compress);
- }
-
- dbuf_write(dr, *datap, checksum, compress, tx);
+ dbuf_write(dr, *datap, tx);
ASSERT(!list_link_active(&dr->dr_dirty_node));
if (dn->dn_object == DMU_META_DNODE_OBJECT)
@@ -2063,8 +2087,7 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx)
}
static void
-dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
- int compress, dmu_tx_t *tx)
+dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
dmu_buf_impl_t *db = dr->dr_dbuf;
dnode_t *dn = db->db_dnode;
@@ -2072,8 +2095,23 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
dmu_buf_impl_t *parent = db->db_parent;
uint64_t txg = tx->tx_txg;
zbookmark_t zb;
+ writeprops_t wp = { 0 };
zio_t *zio;
- int zio_flags;
+
+ if (!BP_IS_HOLE(db->db_blkptr) &&
+ (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) {
+ /*
+ * Private object buffers are released here rather
+ * than in dbuf_dirty() since they are only modified
+ * in the syncing context and we don't want the
+ * overhead of making multiple copies of the data.
+ */
+ arc_release(data, db);
+ } else {
+ ASSERT(arc_released(data));
+ /* XXX why do we need to thaw here? */
+ arc_buf_thaw(data);
+ }
if (parent != dn->dn_dbuf) {
ASSERT(parent && parent->db_data_pending);
@@ -2096,17 +2134,22 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, int checksum,
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[dn->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
+ wp.wp_type = dn->dn_type;
+ wp.wp_level = db->db_level;
+ wp.wp_copies = os->os_copies;
+ wp.wp_dncompress = dn->dn_compress;
+ wp.wp_oscompress = os->os_compress;
+ wp.wp_dnchecksum = dn->dn_checksum;
+ wp.wp_oschecksum = os->os_checksum;
+
if (BP_IS_OLDER(db->db_blkptr, txg))
- dsl_dataset_block_kill(
+ (void) dsl_dataset_block_kill(
os->os_dsl_dataset, db->db_blkptr, zio, tx);
- dr->dr_zio = arc_write(zio, os->os_spa, checksum, compress,
- dmu_get_replication_level(os, &zb, dn->dn_type), txg,
- db->db_blkptr, data, dbuf_write_ready, dbuf_write_done, db,
- ZIO_PRIORITY_ASYNC_WRITE, zio_flags, &zb);
+ dr->dr_zio = arc_write(zio, os->os_spa, &wp,
+ DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr,
+ data, dbuf_write_ready, dbuf_write_done, db,
+ ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
}
/* ARGSUSED */
@@ -2116,27 +2159,33 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
dmu_buf_impl_t *db = vdb;
dnode_t *dn = db->db_dnode;
objset_impl_t *os = dn->dn_objset;
+ blkptr_t *bp = zio->io_bp;
blkptr_t *bp_orig = &zio->io_bp_orig;
uint64_t fill = 0;
int old_size, new_size, i;
+ ASSERT(db->db_blkptr == bp);
+
dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", "");
old_size = bp_get_dasize(os->os_spa, bp_orig);
- new_size = bp_get_dasize(os->os_spa, zio->io_bp);
+ new_size = bp_get_dasize(os->os_spa, bp);
- dnode_diduse_space(dn, new_size-old_size);
+ dnode_diduse_space(dn, new_size - old_size);
- if (BP_IS_HOLE(zio->io_bp)) {
+ if (BP_IS_HOLE(bp)) {
dsl_dataset_t *ds = os->os_dsl_dataset;
dmu_tx_t *tx = os->os_synctx;
if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- ASSERT3U(db->db_blkptr->blk_fill, ==, 0);
+ (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
+ ASSERT3U(bp->blk_fill, ==, 0);
return;
}
+ ASSERT(BP_GET_TYPE(bp) == dn->dn_type);
+ ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+
mutex_enter(&db->db_mtx);
if (db->db_level == 0) {
@@ -2156,32 +2205,31 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
fill = 1;
}
} else {
- blkptr_t *bp = db->db.db_data;
+ blkptr_t *ibp = db->db.db_data;
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
- for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, bp++) {
- if (BP_IS_HOLE(bp))
+ for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+ if (BP_IS_HOLE(ibp))
continue;
- ASSERT3U(BP_GET_LSIZE(bp), ==,
+ ASSERT3U(BP_GET_LSIZE(ibp), ==,
db->db_level == 1 ? dn->dn_datablksz :
(1<<dn->dn_phys->dn_indblkshift));
- fill += bp->blk_fill;
+ fill += ibp->blk_fill;
}
}
- db->db_blkptr->blk_fill = fill;
- BP_SET_TYPE(db->db_blkptr, dn->dn_type);
- BP_SET_LEVEL(db->db_blkptr, db->db_level);
+ bp->blk_fill = fill;
mutex_exit(&db->db_mtx);
- /* We must do this after we've set the bp's type and level */
- if (!DVA_EQUAL(BP_IDENTITY(zio->io_bp), BP_IDENTITY(bp_orig))) {
+ if (zio->io_flags & ZIO_FLAG_IO_REWRITE) {
+ ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig)));
+ } else {
dsl_dataset_t *ds = os->os_dsl_dataset;
dmu_tx_t *tx = os->os_synctx;
if (bp_orig->blk_birth == tx->tx_txg)
- dsl_dataset_block_kill(ds, bp_orig, NULL, tx);
- dsl_dataset_block_born(ds, zio->io_bp, tx);
+ (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx);
+ dsl_dataset_block_born(ds, bp, tx);
}
}
@@ -2198,13 +2246,12 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
mutex_enter(&db->db_mtx);
drp = &db->db_last_dirty;
- while (*drp != db->db_data_pending)
- drp = &(*drp)->dr_next;
- ASSERT(!list_link_active(&(*drp)->dr_dirty_node));
- ASSERT((*drp)->dr_txg == txg);
- ASSERT((*drp)->dr_next == NULL);
- dr = *drp;
- *drp = NULL;
+ while ((dr = *drp) != db->db_data_pending)
+ drp = &dr->dr_next;
+ ASSERT(!list_link_active(&dr->dr_dirty_node));
+ ASSERT(dr->dr_txg == txg);
+ ASSERT(dr->dr_next == NULL);
+ *drp = dr->dr_next;
if (db->db_level == 0) {
ASSERT(db->db_blkid != DB_BONUS_BLKID);
@@ -2230,8 +2277,8 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb)
>> (db->db_level * epbs), >=, db->db_blkid);
arc_set_callback(db->db_buf, dbuf_do_evict, db);
}
- list_destroy(&dr->dt.di.dr_children);
mutex_destroy(&dr->dt.di.dr_mtx);
+ list_destroy(&dr->dt.di.dr_children);
}
kmem_free(dr, sizeof (dbuf_dirty_record_t));
OpenPOWER on IntegriCloud