summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
diff options
context:
space:
mode:
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c')
-rw-r--r--sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c285
1 files changed, 231 insertions, 54 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index d3be6b4..377efb9 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -19,12 +19,10 @@
* CDDL HEADER END
*/
/*
- * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
* Use is subject to license terms.
*/
-#pragma ident "%Z%%M% %I% %E% SMI"
-
#include <sys/dmu.h>
#include <sys/dmu_impl.h>
#include <sys/dmu_tx.h>
@@ -42,6 +40,7 @@
#include <sys/zfs_ioctl.h>
#include <sys/zap.h>
#include <sys/zio_checksum.h>
+#include <sys/zfs_znode.h>
const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ byteswap_uint8_array, TRUE, "unallocated" },
@@ -62,7 +61,7 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ zap_byteswap, TRUE, "DSL props" },
{ byteswap_uint64_array, TRUE, "DSL dataset" },
{ zfs_znode_byteswap, TRUE, "ZFS znode" },
- { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { zfs_oldacl_byteswap, TRUE, "ZFS V0 ACL" },
{ byteswap_uint8_array, FALSE, "ZFS plain file" },
{ zap_byteswap, TRUE, "ZFS directory" },
{ zap_byteswap, TRUE, "ZFS master node" },
@@ -75,7 +74,14 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = {
{ zap_byteswap, TRUE, "persistent error log" },
{ byteswap_uint8_array, TRUE, "SPA history" },
{ byteswap_uint64_array, TRUE, "SPA history offsets" },
- { zap_byteswap, TRUE, "Pool properties" },
+ { zap_byteswap, TRUE, "Pool properties" },
+ { zap_byteswap, TRUE, "DSL permissions" },
+ { zfs_acl_byteswap, TRUE, "ZFS ACL" },
+ { byteswap_uint8_array, TRUE, "ZFS SYSACL" },
+ { byteswap_uint8_array, TRUE, "FUID table" },
+ { byteswap_uint64_array, TRUE, "FUID table size" },
+ { zap_byteswap, TRUE, "DSL dataset next clones"},
+ { zap_byteswap, TRUE, "scrub work queue" },
};
int
@@ -115,6 +121,19 @@ dmu_bonus_max(void)
return (DN_MAX_BONUSLEN);
}
+int
+dmu_set_bonus(dmu_buf_t *db, int newsize, dmu_tx_t *tx)
+{
+ dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode;
+
+ if (dn->dn_bonus != (dmu_buf_impl_t *)db)
+ return (EINVAL);
+ if (newsize < 0 || newsize > db->db_size)
+ return (EINVAL);
+ dnode_setbonuslen(dn, newsize, tx);
+ return (0);
+}
+
/*
* returns ENOENT, EIO, or 0.
*/
@@ -122,27 +141,27 @@ int
dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
{
dnode_t *dn;
- int err, count;
dmu_buf_impl_t *db;
+ int error;
- err = dnode_hold(os->os, object, FTAG, &dn);
- if (err)
- return (err);
+ error = dnode_hold(os->os, object, FTAG, &dn);
+ if (error)
+ return (error);
rw_enter(&dn->dn_struct_rwlock, RW_READER);
if (dn->dn_bonus == NULL) {
rw_exit(&dn->dn_struct_rwlock);
rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
if (dn->dn_bonus == NULL)
- dn->dn_bonus = dbuf_create_bonus(dn);
+ dbuf_create_bonus(dn);
}
db = dn->dn_bonus;
rw_exit(&dn->dn_struct_rwlock);
- mutex_enter(&db->db_mtx);
- count = refcount_add(&db->db_holds, tag);
- mutex_exit(&db->db_mtx);
- if (count == 1)
- dnode_add_ref(dn, db);
+
+ /* as long as the bonus buf is held, the dnode will be held */
+ if (refcount_add(&db->db_holds, tag) == 1)
+ VERIFY(dnode_add_ref(dn, db));
+
dnode_rele(dn, FTAG);
VERIFY(0 == dbuf_read(db, NULL, DB_RF_MUST_SUCCEED));
@@ -161,11 +180,13 @@ static int
dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp)
{
+ dsl_pool_t *dp = NULL;
dmu_buf_t **dbp;
uint64_t blkid, nblks, i;
uint32_t flags;
int err;
zio_t *zio;
+ hrtime_t start;
ASSERT(length <= DMU_MAX_ACCESS);
@@ -192,7 +213,11 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
}
dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP);
- zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, TRUE);
+ if (dn->dn_objset->os_dsl_dataset)
+ dp = dn->dn_objset->os_dsl_dataset->ds_dir->dd_pool;
+ if (dp && dsl_pool_sync_context(dp))
+ start = gethrtime();
+ zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL);
blkid = dbuf_whichblock(dn, offset);
for (i = 0; i < nblks; i++) {
dmu_buf_impl_t *db = dbuf_hold(dn, blkid+i, tag);
@@ -214,6 +239,9 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset,
/* wait for async i/o */
err = zio_wait(zio);
+ /* track read overhead when we are in sync context */
+ if (dp && dsl_pool_sync_context(dp))
+ dp->dp_read_overhead += gethrtime() - start;
if (err) {
dmu_buf_rele_array(dbp, nblks, tag);
return (err);
@@ -343,6 +371,155 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len)
dnode_rele(dn, FTAG);
}
+static int
+get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit)
+{
+ uint64_t len = *offset - limit;
+ uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT;
+ uint64_t subchunk =
+ dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+
+ ASSERT(limit <= *offset);
+
+ if (len <= chunk_len) {
+ *offset = limit;
+ return (0);
+ }
+
+ ASSERT(ISP2(subchunk));
+
+ while (*offset > limit) {
+ uint64_t initial_offset = P2ROUNDUP(*offset, subchunk);
+ uint64_t delta;
+ int err;
+
+ /* skip over allocated data */
+ err = dnode_next_offset(dn,
+ DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+ if (err == ESRCH)
+ *offset = limit;
+ else if (err)
+ return (err);
+
+ ASSERT3U(*offset, <=, initial_offset);
+ *offset = P2ALIGN(*offset, subchunk);
+ delta = initial_offset - *offset;
+ if (delta >= chunk_len) {
+ *offset += delta - chunk_len;
+ return (0);
+ }
+ chunk_len -= delta;
+
+ /* skip over unallocated data */
+ err = dnode_next_offset(dn,
+ DNODE_FIND_BACKWARDS, offset, 1, 1, 0);
+ if (err == ESRCH)
+ *offset = limit;
+ else if (err)
+ return (err);
+
+ if (*offset < limit)
+ *offset = limit;
+ ASSERT3U(*offset, <, initial_offset);
+ }
+ return (0);
+}
+
+static int
+dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset,
+ uint64_t length, boolean_t free_dnode)
+{
+ dmu_tx_t *tx;
+ uint64_t object_size, start, end, len;
+ boolean_t trunc = (length == DMU_OBJECT_END);
+ int align, err;
+
+ align = 1 << dn->dn_datablkshift;
+ ASSERT(align > 0);
+ object_size = align == 1 ? dn->dn_datablksz :
+ (dn->dn_maxblkid + 1) << dn->dn_datablkshift;
+
+ if (trunc || (end = offset + length) > object_size)
+ end = object_size;
+ if (end <= offset)
+ return (0);
+ length = end - offset;
+
+ while (length) {
+ start = end;
+ err = get_next_chunk(dn, &start, offset);
+ if (err)
+ return (err);
+ len = trunc ? DMU_OBJECT_END : end - start;
+
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_free(tx, dn->dn_object, start, len);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err) {
+ dmu_tx_abort(tx);
+ return (err);
+ }
+
+ dnode_free_range(dn, start, trunc ? -1 : len, tx);
+
+ if (start == 0 && free_dnode) {
+ ASSERT(trunc);
+ dnode_free(dn, tx);
+ }
+
+ length -= end - start;
+
+ dmu_tx_commit(tx);
+ end = start;
+ }
+ return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+ uint64_t offset, uint64_t length)
+{
+ dnode_t *dn;
+ int err;
+
+ err = dnode_hold(os->os, object, FTAG, &dn);
+ if (err != 0)
+ return (err);
+ err = dmu_free_long_range_impl(os, dn, offset, length, FALSE);
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
+int
+dmu_free_object(objset_t *os, uint64_t object)
+{
+ dnode_t *dn;
+ dmu_tx_t *tx;
+ int err;
+
+ err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED,
+ FTAG, &dn);
+ if (err != 0)
+ return (err);
+ if (dn->dn_nlevels == 1) {
+ tx = dmu_tx_create(os);
+ dmu_tx_hold_bonus(tx, object);
+ dmu_tx_hold_free(tx, dn->dn_object, 0, DMU_OBJECT_END);
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err == 0) {
+ dnode_free_range(dn, 0, DMU_OBJECT_END, tx);
+ dnode_free(dn, tx);
+ dmu_tx_commit(tx);
+ } else {
+ dmu_tx_abort(tx);
+ }
+ } else {
+ err = dmu_free_long_range_impl(os, dn, 0, DMU_OBJECT_END, TRUE);
+ }
+ dnode_rele(dn, FTAG);
+ return (err);
+}
+
int
dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
uint64_t size, dmu_tx_t *tx)
@@ -384,7 +561,6 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
while (size > 0) {
uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
- int err;
/*
* NB: we could do this block-at-a-time, but it's nice
@@ -393,7 +569,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
err = dmu_buf_hold_array_by_dnode(dn, offset, mylen,
TRUE, FTAG, &numbufs, &dbp);
if (err)
- return (err);
+ break;
for (i = 0; i < numbufs; i++) {
int tocpy;
@@ -414,7 +590,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
dmu_buf_rele_array(dbp, numbufs, FTAG);
}
dnode_rele(dn, FTAG);
- return (0);
+ return (err);
}
void
@@ -590,9 +766,9 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size,
for (copied = 0; copied < tocpy; copied += PAGESIZE) {
ASSERT3U(pp->p_offset, ==, db->db_offset + bufoff);
thiscpy = MIN(PAGESIZE, tocpy - copied);
- va = ppmapin(pp, PROT_READ, (caddr_t)-1);
+ va = zfs_map_page(pp, S_READ);
bcopy(va, (char *)db->db_data + bufoff, thiscpy);
- ppmapout(va);
+ zfs_unmap_page(pp, va);
pp = pp->p_next;
bufoff += PAGESIZE;
}
@@ -620,6 +796,22 @@ typedef struct {
/* ARGSUSED */
static void
+dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg)
+{
+ blkptr_t *bp = zio->io_bp;
+
+ if (!BP_IS_HOLE(bp)) {
+ dmu_sync_arg_t *in = varg;
+ dbuf_dirty_record_t *dr = in->dr;
+ dmu_buf_impl_t *db = dr->dr_dbuf;
+ ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type);
+ ASSERT(BP_GET_LEVEL(bp) == 0);
+ bp->blk_fill = 1;
+ }
+}
+
+/* ARGSUSED */
+static void
dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
{
dmu_sync_arg_t *in = varg;
@@ -627,12 +819,6 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg)
dmu_buf_impl_t *db = dr->dr_dbuf;
dmu_sync_cb_t *done = in->done;
- if (!BP_IS_HOLE(zio->io_bp)) {
- zio->io_bp->blk_fill = 1;
- BP_SET_TYPE(zio->io_bp, db->db_dnode->dn_type);
- BP_SET_LEVEL(zio->io_bp, 0);
- }
-
mutex_enter(&db->db_mtx);
ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC);
dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */
@@ -679,14 +865,13 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
dbuf_dirty_record_t *dr;
dmu_sync_arg_t *in;
zbookmark_t zb;
+ writeprops_t wp = { 0 };
zio_t *zio;
- int zio_flags;
int err;
ASSERT(BP_IS_HOLE(bp));
ASSERT(txg != 0);
-
dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n",
txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg);
@@ -791,15 +976,20 @@ dmu_sync(zio_t *pio, dmu_buf_t *db_fake,
zb.zb_object = db->db.db_object;
zb.zb_level = db->db_level;
zb.zb_blkid = db->db_blkid;
- zio_flags = ZIO_FLAG_MUSTSUCCEED;
- if (dmu_ot[db->db_dnode->dn_type].ot_metadata || zb.zb_level != 0)
- zio_flags |= ZIO_FLAG_METADATA;
- zio = arc_write(pio, os->os_spa,
- zio_checksum_select(db->db_dnode->dn_checksum, os->os_checksum),
- zio_compress_select(db->db_dnode->dn_compress, os->os_compress),
- dmu_get_replication_level(os, &zb, db->db_dnode->dn_type),
- txg, bp, dr->dt.dl.dr_data, NULL, dmu_sync_done, in,
- ZIO_PRIORITY_SYNC_WRITE, zio_flags, &zb);
+
+ wp.wp_type = db->db_dnode->dn_type;
+ wp.wp_level = db->db_level;
+ wp.wp_copies = os->os_copies;
+ wp.wp_dnchecksum = db->db_dnode->dn_checksum;
+ wp.wp_oschecksum = os->os_checksum;
+ wp.wp_dncompress = db->db_dnode->dn_compress;
+ wp.wp_oscompress = os->os_compress;
+
+ ASSERT(BP_IS_HOLE(bp));
+
+ zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db),
+ txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in,
+ ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb);
if (pio) {
zio_nowait(zio);
@@ -855,21 +1045,6 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,
}
int
-dmu_get_replication_level(objset_impl_t *os,
- zbookmark_t *zb, dmu_object_type_t ot)
-{
- int ncopies = os->os_copies;
-
- /* If it's the mos, it should have max copies set. */
- ASSERT(zb->zb_objset != 0 ||
- ncopies == spa_max_replication(os->os_spa));
-
- if (dmu_ot[ot].ot_metadata || zb->zb_level != 0)
- ncopies++;
- return (MIN(ncopies, spa_max_replication(os->os_spa)));
-}
-
-int
dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
{
dnode_t *dn;
@@ -894,7 +1069,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off)
return (err);
}
- err = dnode_next_offset(dn, hole, off, 1, 1, 0);
+ err = dnode_next_offset(dn, (hole ? DNODE_FIND_HOLE : 0), off, 1, 1, 0);
dnode_rele(dn, FTAG);
return (err);
@@ -1018,6 +1193,7 @@ dmu_init(void)
dbuf_init();
dnode_init();
arc_init();
+ l2arc_init();
}
void
@@ -1026,4 +1202,5 @@ dmu_fini(void)
arc_fini();
dnode_fini();
dbuf_fini();
+ l2arc_fini();
}
OpenPOWER on IntegriCloud