summaryrefslogtreecommitdiffstats
path: root/sys/cddl/contrib/opensolaris/uts
diff options
context:
space:
mode:
author delphij <delphij@FreeBSD.org> 2013-06-11 19:02:36 +0000
committer delphij <delphij@FreeBSD.org> 2013-06-11 19:02:36 +0000
commit 9d0815fcd162db9197ce3750a1e17c721ecbd315 (patch)
tree 85382c064b2f1478b74ec96e01bb7b504ad9549b /sys/cddl/contrib/opensolaris/uts
parent 0ae9c65c0f385ead30d33793507556ef4639f6e5 (diff)
downloadFreeBSD-src-9d0815fcd162db9197ce3750a1e17c721ecbd315.zip
FreeBSD-src-9d0815fcd162db9197ce3750a1e17c721ecbd315.tar.gz
MFV r251619:
ZFS needs better comments. Illumos ZFS issues: 3741 zfs needs better comments. MFC after: 2 weeks
Diffstat (limited to 'sys/cddl/contrib/opensolaris/uts')
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c 19
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c 24
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c 8
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c 15
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c 2
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h 8
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c 14
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c 1
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c 65
-rw-r--r-- sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c 5
11 files changed, 151 insertions, 12 deletions
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
index 8396276..6152144 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c
@@ -289,7 +289,18 @@ typedef struct arc_stats {
kstat_named_t arcstat_deleted;
kstat_named_t arcstat_stolen;
kstat_named_t arcstat_recycle_miss;
+ /*
+ * Number of buffers that could not be evicted because the hash lock
+ * was held by another thread. The lock may not necessarily be held
+ * by something using the same buffer, since hash locks are shared
+ * by multiple buffers.
+ */
kstat_named_t arcstat_mutex_miss;
+ /*
+ * Number of buffers skipped because they have I/O in progress, are
+ * indirect prefetch buffers that have not lived long enough, or are
+ * not from the spa we're trying to evict from.
+ */
kstat_named_t arcstat_evict_skip;
kstat_named_t arcstat_evict_l2_cached;
kstat_named_t arcstat_evict_l2_eligible;
@@ -3247,6 +3258,10 @@ top:
mutex_exit(hash_lock);
+ /*
+ * At this point, we have a level 1 cache miss. Try again in
+ * L2ARC if possible.
+ */
ASSERT3U(hdr->b_size, ==, size);
DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp,
uint64_t, size, zbookmark_t *, zb);
@@ -3488,8 +3503,8 @@ arc_buf_evict(arc_buf_t *buf)
}
/*
- * Release this buffer from the cache. This must be done
- * after a read and prior to modifying the buffer contents.
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
* If the buffer has more than one reference, we must make
* a new hdr for the buffer.
*/
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
index cda8c17..28aa330 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dbuf.c
@@ -641,6 +641,14 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
if (!havepzio)
err = zio_wait(zio);
} else {
+ /*
+ * Another reader came in while the dbuf was in flight
+ * between UNCACHED and CACHED. Either a writer will finish
+ * writing the buffer (sending the dbuf to CACHED) or the
+ * first reader's request will reach the read_done callback
+ * and send the dbuf to CACHED. Otherwise, a failure
+ * occurred and the dbuf went to UNCACHED.
+ */
mutex_exit(&db->db_mtx);
if (prefetch)
dmu_zfetch(&dn->dn_zfetch, db->db.db_offset,
@@ -649,6 +657,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
rw_exit(&dn->dn_struct_rwlock);
DB_DNODE_EXIT(db);
+ /* Skip the wait per the caller's request. */
mutex_enter(&db->db_mtx);
if ((flags & DB_RF_NEVERWAIT) == 0) {
while (db->db_state == DB_READ ||
@@ -1264,7 +1273,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
}
/*
- * Return TRUE if this evicted the dbuf.
+ * Undirty a buffer in the transaction group referenced by the given
+ * transaction. Return whether this evicted the dbuf.
*/
static boolean_t
dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx)
@@ -2225,6 +2235,7 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
ASSERT(db->db_level > 0);
DBUF_VERIFY(db);
+ /* Read the block if it hasn't been read yet. */
if (db->db_buf == NULL) {
mutex_exit(&db->db_mtx);
(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
@@ -2235,10 +2246,12 @@ dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
+ /* Indirect block size must match what the dnode thinks it is. */
ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
dbuf_check_blkptr(dn, db);
DB_DNODE_EXIT(db);
+ /* Provide the pending dirty record to child dbufs */
db->db_data_pending = dr;
mutex_exit(&db->db_mtx);
@@ -2629,6 +2642,7 @@ dbuf_write_override_done(zio_t *zio)
dbuf_write_done(zio, NULL, db);
}
+/* Issue I/O to commit a dirty buffer to disk. */
static void
dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
{
@@ -2663,11 +2677,19 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx)
}
if (parent != dn->dn_dbuf) {
+ /* Our parent is an indirect block. */
+ /* We have a dirty parent that has been scheduled for write. */
ASSERT(parent && parent->db_data_pending);
+ /* Our parent's buffer is one level closer to the dnode. */
ASSERT(db->db_level == parent->db_level-1);
+ /*
+ * We're about to modify our parent's db_data by modifying
+ * our block pointer, so the parent must be released.
+ */
ASSERT(arc_released(parent->db_buf));
zio = parent->db_data_pending->dr_zio;
} else {
+ /* Our parent is the dnode itself. */
ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 &&
db->db_blkid != DMU_SPILL_BLKID) ||
(db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0));
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
index 75716ed..fd0464e 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu.c
@@ -1839,7 +1839,7 @@ dmu_init(void)
void
dmu_fini(void)
{
- arc_fini();
+ arc_fini(); /* arc depends on l2arc, so arc must go first */
l2arc_fini();
zfetch_fini();
dbuf_fini();
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
index 3eeaca6..5338425 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_tx.c
@@ -1014,6 +1014,10 @@ dmu_tx_unassign(dmu_tx_t *tx)
txg_rele_to_quiesce(&tx->tx_txgh);
+ /*
+ * Walk the transaction's hold list, removing the hold on the
+ * associated dnode, and notifying waiters if the refcount drops to 0.
+ */
for (txh = list_head(&tx->tx_holds); txh != tx->tx_needassign_txh;
txh = list_next(&tx->tx_holds, txh)) {
dnode_t *dn = txh->txh_dnode;
@@ -1126,6 +1130,10 @@ dmu_tx_commit(dmu_tx_t *tx)
ASSERT(tx->tx_txg != 0);
+ /*
+ * Go through the transaction's hold list and remove holds on
+ * associated dnodes, notifying waiters if no holds remain.
+ */
while (txh = list_head(&tx->tx_holds)) {
dnode_t *dn = txh->txh_dnode;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
index b5ca666..8ab5e10 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/dmu_zfetch.c
@@ -66,11 +66,11 @@ SYSCTL_UQUAD(_vfs_zfs_zfetch, OID_AUTO, array_rd_sz, CTLFLAG_RDTUN,
"Number of bytes in a array_read at which we stop prefetching");
/* forward decls for static routines */
-static int dmu_zfetch_colinear(zfetch_t *, zstream_t *);
+static boolean_t dmu_zfetch_colinear(zfetch_t *, zstream_t *);
static void dmu_zfetch_dofetch(zfetch_t *, zstream_t *);
static uint64_t dmu_zfetch_fetch(dnode_t *, uint64_t, uint64_t);
static uint64_t dmu_zfetch_fetchsz(dnode_t *, uint64_t, uint64_t);
-static int dmu_zfetch_find(zfetch_t *, zstream_t *, int);
+static boolean_t dmu_zfetch_find(zfetch_t *, zstream_t *, int);
static int dmu_zfetch_stream_insert(zfetch_t *, zstream_t *);
static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *);
static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *);
@@ -122,9 +122,9 @@ kstat_t *zfetch_ksp;
* last stream, then we are probably in a strided access pattern. So
* combine the two sequential streams into a single strided stream.
*
- * If no co-linear streams are found, return NULL.
+ * Returns whether co-linear streams were found.
*/
-static int
+static boolean_t
dmu_zfetch_colinear(zfetch_t *zf, zstream_t *zh)
{
zstream_t *z_walk;
@@ -344,7 +344,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks)
* for this block read. If so, it starts a prefetch for the stream it
* located and returns true, otherwise it returns false
*/
-static int
+static boolean_t
dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched)
{
zstream_t *zs;
@@ -669,7 +669,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
{
zstream_t zst;
zstream_t *newstream;
- int fetched;
+ boolean_t fetched;
int inserted;
unsigned int blkshft;
uint64_t blksz;
@@ -695,7 +695,8 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched)
ZFETCHSTAT_BUMP(zfetchstat_hits);
} else {
ZFETCHSTAT_BUMP(zfetchstat_misses);
- if (fetched = dmu_zfetch_colinear(zf, &zst)) {
+ fetched = dmu_zfetch_colinear(zf, &zst);
+ if (fetched) {
ZFETCHSTAT_BUMP(zfetchstat_colinear_hits);
} else {
ZFETCHSTAT_BUMP(zfetchstat_colinear_misses);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
index c23fa0a..3740056 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c
@@ -27,6 +27,8 @@
*/
/*
+ * SPA: Storage Pool Allocator
+ *
* This file contains all the routines used when modifying on-disk SPA state.
* This includes opening, importing, destroying, exporting a pool, and syncing a
* pool.
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
index b67ff2d..eb7ed24 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/dmu.h
@@ -411,6 +411,8 @@ void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp,
* object must be held in an assigned transaction before calling
* dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus
* buffer as well. You must release your hold with dmu_buf_rele().
+ *
+ * Returns ENOENT, EIO, or 0.
*/
int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **);
int dmu_bonus_max(void);
@@ -666,8 +668,14 @@ extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS];
* If doi is NULL, just indicates whether the object exists.
*/
int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dnode in hand. */
void dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi);
+/* Like dmu_object_info, but faster if you have a held dbuf in hand. */
void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi);
+/*
+ * Like dmu_object_info_from_db, but faster still when you only care about
+ * the size. This is specifically optimized for zfs_getattr().
+ */
void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize,
u_longlong_t *nblk512);
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
index 46b5c34..5a83ee2 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/txg.c
@@ -348,6 +348,12 @@ txg_rele_to_sync(txg_handle_t *th)
th->th_cpu = NULL; /* defensive */
}
+/*
+ * Blocks until all transactions in the group are committed.
+ *
+ * On return, the transaction group has reached a stable state in which it can
+ * then be passed off to the syncing context.
+ */
static void
txg_quiesce(dsl_pool_t *dp, uint64_t txg)
{
@@ -397,6 +403,9 @@ txg_do_callbacks(void *arg)
/*
* Dispatch the commit callbacks registered on this txg to worker threads.
+ *
+ * If no callbacks are registered for a given TXG, nothing happens.
+ * This function creates a taskq for the associated pool, if needed.
*/
static void
txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
@@ -407,7 +416,10 @@ txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg)
for (c = 0; c < max_ncpus; c++) {
tx_cpu_t *tc = &tx->tx_cpu[c];
- /* No need to lock tx_cpu_t at this point */
+ /*
+ * No need to lock tx_cpu_t at this point, since this can
+ * only be called once a txg has been synced.
+ */
int g = txg & TXG_MASK;
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
index 92ae0ed..f4e86a1 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_label.c
@@ -1044,6 +1044,7 @@ vdev_uberblock_sync(zio_t *zio, uberblock_t *ub, vdev_t *vd, int flags)
zio_buf_free(ubbuf, VDEV_UBERBLOCK_SIZE(vd));
}
+/* Sync the uberblocks to all vdevs in svd[] */
int
vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
index 1cc343a..0a107de 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c
@@ -433,23 +433,50 @@ static const zio_vsd_ops_t vdev_raidz_vsd_ops = {
vdev_raidz_cksum_report
};
+/*
+ * Divides the IO evenly across all child vdevs; usually, dcols is
+ * the number of children in the target vdev.
+ */
static raidz_map_t *
vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols,
uint64_t nparity)
{
raidz_map_t *rm;
+ /* The starting RAIDZ (parent) vdev sector of the block. */
uint64_t b = zio->io_offset >> unit_shift;
+ /* The zio's size in units of the vdev's minimum sector size. */
uint64_t s = zio->io_size >> unit_shift;
+ /* The first column for this stripe. */
uint64_t f = b % dcols;
+ /* The starting byte offset on each child vdev. */
uint64_t o = (b / dcols) << unit_shift;
uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot;
+ /*
+ * "Quotient": The number of data sectors for this stripe on all but
+ * the "big column" child vdevs that also contain "remainder" data.
+ */
q = s / (dcols - nparity);
+
+ /*
+ * "Remainder": The number of partial stripe data sectors in this I/O.
+ * This will add a sector to some, but not all, child vdevs.
+ */
r = s - q * (dcols - nparity);
+
+ /* The number of "big columns" - those which contain remainder data. */
bc = (r == 0 ? 0 : r + nparity);
+
+ /*
+ * The total number of data and parity sectors associated with
+ * this I/O.
+ */
tot = s + nparity * (q + (r == 0 ? 0 : 1));
+ /* acols: The columns that will be accessed. */
+ /* scols: The columns that will be accessed or skipped. */
if (q == 0) {
+ /* Our I/O request doesn't span all child vdevs. */
acols = bc;
scols = MIN(dcols, roundup(bc, nparity + 1));
} else {
@@ -1529,6 +1556,23 @@ vdev_raidz_child_done(zio_t *zio)
rc->rc_skipped = 0;
}
+/*
+ * Start an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Generate the parity data
+ * 2. Create child zio write operations to each column's vdev, for both
+ * data and parity.
+ * 3. If the column skips any sectors for padding, create optional dummy
+ * write zio children for those areas to improve aggregation continuity.
+ * - For read operations:
+ * 1. Create child zio read operations to each data column's vdev to read
+ * the range of data required for zio.
+ * 2. If this is a scrub or resilver operation, or if any of the data
+ * vdevs have had errors, then create zio read operations to the parity
+ * columns' VDevs as well.
+ */
static int
vdev_raidz_io_start(zio_t *zio)
{
@@ -1881,6 +1925,27 @@ done:
return (ret);
}
+/*
+ * Complete an IO operation on a RAIDZ VDev
+ *
+ * Outline:
+ * - For write operations:
+ * 1. Check for errors on the child IOs.
+ * 2. Return, setting an error code if too few child VDevs were written
+ * to reconstruct the data later. Note that partial writes are
+ * considered successful if they can be reconstructed at all.
+ * - For read operations:
+ * 1. Check for errors on the child IOs.
+ * 2. If data errors occurred:
+ * a. Try to reassemble the data from the parity available.
+ * b. If we haven't yet read the parity drives, read them now.
+ * c. If all parity drives have been read but the data still doesn't
+ * reassemble with a correct checksum, then try combinatorial
+ * reconstruction.
+ * d. If that doesn't work, return an error.
+ * 3. If there were unexpected errors or this is a resilver operation,
+ * rewrite the vdevs that had errors.
+ */
static void
vdev_raidz_io_done(zio_t *zio)
{
diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
index c155e47..3961c48 100644
--- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
+++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_ctldir.c
@@ -633,6 +633,11 @@ static struct vop_vector zfsctl_ops_root = {
.vop_fid = zfsctl_common_fid,
};
+/*
+ * Gets the full dataset name that corresponds to the given snapshot name.
+ * Example:
+ * zfsctl_snapshot_zname("snap1") -> "mypool/myfs@snap1"
+ */
static int
zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname)
{
OpenPOWER on IntegriCloud